started: 01Mar2016 last updated: 02Jun2016

start_section

# Start time
Sys.time()
## [1] "2016-06-02 21:34:33 BST"
# Clean-up
rm(list=ls())
graphics.off()

# Set root working folder
library(knitr)
opts_knit$set(root.dir = "/scratch/medgen/scripts/rscripts_05.16")
#setwd("/scratch/medgen/scripts/rscripts_05.16")

#opts_knit$set(root.dir = "C:\\Users\\larion01\\Documents\\GitHub\\R1")
#setwd("C:\\Users\\larion01\\Documents\\GitHub\\R1")

# Required libraries
library(tidyr) # for separate
library(dplyr) # for piping, filter, select etc
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr) # for str_replace_all
library(ggplot2) # for some plots

load_data

load(file="data/s04_filter_by_effect_feb2016.RData")
ls()
##  [1] "alt.rel"         "alt.std"         "alt.str"        
##  [4] "covar.df"        "demographics.df" "dp.rel"         
##  [7] "dp.std"          "dp.str"          "gq.rel"         
## [10] "gq.std"          "gq.str"          "gt.rel"         
## [13] "gt.std"          "gt.str"          "ref.rel"        
## [16] "ref.std"         "ref.str"         "samples.df"     
## [19] "vv.rel"          "vv.std"          "vv.str"
vv.df <- vv.std
gt.mx <- gt.std

rm(vv.std, gt.std, gq.std, dp.std, ref.std, alt.std)
rm(vv.str, gt.str, gq.str, dp.str, ref.str, alt.str)
rm(vv.rel, gt.rel, gq.rel, dp.rel, ref.rel, alt.rel)

ls()
## [1] "covar.df"        "demographics.df" "gt.mx"           "samples.df"     
## [5] "vv.df"
# I do NOT convert_dfs_to_tables because it loses row manes. 
# However, it could be considered later, 
# when row names handling in tables is fixed. 

check_data

dim(covar.df)
## [1] 498  34
str(covar.df)
## 'data.frame':    498 obs. of  34 variables:
##  $ Subject_ID      : int  200054 200491 200565 200698 200958 201046 201558 201921 202026 202236 ...
##  $ setno           : int  382125 204356 360683 226881 357431 374980 201558 201921 213991 385058 ...
##  $ cc              : int  0 0 0 0 0 0 1 1 0 0 ...
##  $ chemo           : int  1 1 0 0 1 1 1 1 1 1 ...
##  $ hormone         : int  0 1 1 0 1 0 0 1 1 0 ...
##  $ chemo_hormone   : Factor w/ 5 levels "","both","chem",..: 3 2 4 5 2 3 3 2 2 3 ...
##  $ chemo_self_mra  : int  1 1 0 0 1 1 1 1 1 1 ...
##  $ hormone_self_mra: int  0 1 1 0 1 0 0 1 1 0 ...
##  $ treatment       : int  1 1 1 0 1 1 1 1 1 1 ...
##  $ ID              : int  2 6 7 9 11 12 16 22 24 26 ...
##  $ labid           : Factor w/ 498 levels "id200054","id200491",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ status          : int  0 0 0 0 0 0 1 1 0 0 ...
##  $ status2         : int  0 0 0 0 0 0 1 1 0 0 ...
##  $ offset          : num  6.41 5.82 5.61 4.03 4.77 ...
##  $ sub_dx_age      : int  46 50 46 44 43 39 45 40 43 36 ...
##  $ XRTBreast       : int  0 0 0 1 0 1 0 0 1 0 ...
##  $ Eigen_1         : num  -0.0079 -0.01062 -0.00539 -0.00906 -0.01094 ...
##  $ Eigen_2         : num  0.0055 0.00414 0.00302 0.00301 0.0023 ...
##  $ Eigen_3         : num  -0.02171 0.01254 0.00368 -0.00655 -0.01389 ...
##  $ Eigen_4         : num  0.00356 -0.00787 -0.01496 -0.01256 -0.00501 ...
##  $ Eigen_5         : num  0.00135 -0.0291 0.00391 0.01357 -0.00526 ...
##  $ dose            : Factor w/ 3 levels "ge 1Gy","ls 1Gy",..: 3 3 3 2 3 2 3 3 2 3 ...
##  $ dsmiss          : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ good_location   : int  1 1 0 1 1 1 0 1 0 0 ...
##  $ Deleterious     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ registry        : Factor w/ 5 levels "de","IA","IR",..: 1 3 3 5 5 4 3 2 2 1 ...
##  $ race            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ age_stratum1    : Factor w/ 5 levels "20to34","35to39",..: 4 4 4 3 3 2 3 3 3 2 ...
##  $ dxyr_stratum    : int  2 2 3 1 1 2 1 2 3 2 ...
##  $ CMF_Only        : int  1 0 0 0 0 1 0 0 1 1 ...
##  $ family_history  : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ sub_dx_age_cat  : int  0 0 0 1 1 1 0 1 1 1 ...
##  $ CMF             : Factor w/ 3 levels "CMF","no","Oth": 1 3 2 2 3 1 3 3 1 1 ...
##  $ XRTBrCHAR       : int  0 0 0 1 0 1 0 0 1 0 ...
covar.df[1:5,1:5]
##   Subject_ID  setno cc chemo hormone
## 1     200054 382125  0     1       0
## 2     200491 204356  0     1       1
## 3     200565 360683  0     0       1
## 4     200698 226881  0     0       0
## 5     200958 357431  0     1       1
dim(demographics.df)
## [1] 505  91
str(demographics.df)
## 'data.frame':    505 obs. of  91 variables:
##  $ Subject_ID            : Factor w/ 503 levels "200054","200491",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ ID.x                  : num  2 6 7 9 11 12 NA NA 24 26 ...
##  $ labid.x               : Factor w/ 498 levels "15582015","19212019",..: 6 7 8 9 10 11 NA NA 12 13 ...
##  $ Eigen_1.x             : num  -0.0079 -0.01062 -0.00539 -0.00906 -0.01094 ...
##  $ Eigen_2.x             : num  0.0055 0.00414 0.00302 0.00301 0.0023 ...
##  $ Eigen_3.x             : num  -0.02171 0.01254 0.00368 -0.00655 -0.01389 ...
##  $ Eigen_4.x             : num  0.00356 -0.00787 -0.01496 -0.01256 -0.00501 ...
##  $ Eigen_5.x             : num  0.00135 -0.0291 0.00391 0.01357 -0.00526 ...
##  $ Phase                 : num  1 1 1 1 1 1 NA NA 1 1 ...
##  $ setno.x               : num  382125 204356 360683 226881 357431 ...
##  $ cc.x                  : int  0 0 0 0 0 0 NA NA 0 0 ...
##  $ rstime                : num  7.42 9.75 6.09 6.25 9.34 7 NA NA 6.09 8.66 ...
##  $ registry.x            : Factor w/ 7 levels "","de","IA","IR",..: 2 4 4 7 7 5 NA NA 3 2 ...
##  $ race.x                : num  0 0 0 0 0 0 NA NA 0 0 ...
##  $ offset.x              : num  6.41 5.82 5.61 4.03 4.77 ...
##  $ DOB                   : num  -5049 -7459 -3916 -5890 -6407 ...
##  $ sub_dx_age.x          : num  46 50 46 44 43 39 NA NA 43 36 ...
##  $ refage                : num  53 59 51 50 52 45 NA NA 48 44 ...
##  $ BMI_age18             : num  20.2 19.7 23.3 19.5 25.8 ...
##  $ BMI_dx                : num  22.8 20.9 23.3 32.6 25.8 ...
##  $ BMI_ref               : num  25.7 22 25.8 32.6 28.1 ...
##  $ hormone_self_mra.x    : num  0 1 1 0 1 0 NA NA 1 0 ...
##  $ chemo_self_mra.x      : num  1 1 0 0 1 1 NA NA 1 1 ...
##  $ treatment.x           : num  1 1 1 0 1 1 NA NA 1 1 ...
##  $ family_history.x      : Factor w/ 3 levels "1+","none","othe": 2 2 2 2 2 2 NA NA 2 2 ...
##  $ rh_age_menarche       : num  13 14 13 13 12 9 NA NA 12 11 ...
##  $ age_menopause_1yrbf_fd: num  -1 48 -1 -1 -1 -1 NA NA -1 -1 ...
##  $ age_1fftp_fd          : num  20 22 0 29 28 0 NA NA 27 24 ...
##  $ Num_ftp_fd            : num  1 2 -1 2 2 -1 NA NA 3 2 ...
##  $ Histology             : Factor w/ 4 levels "lobular","medullar",..: 3 3 3 2 3 3 NA NA 3 3 ...
##  $ Histology1            : Factor w/ 4 levels "lobular","medullar",..: 3 3 3 2 3 3 NA NA 3 3 ...
##  $ Hist_lob_other        : Factor w/ 4 levels "lobular","other",..: 2 2 2 2 2 2 NA NA 2 2 ...
##  $ stage_fd              : num  2 1 1 1 2 2 NA NA 2 2 ...
##  $ er_fd                 : num  1 1 1 4 1 2 NA NA 2 2 ...
##  $ pr_fd                 : num  1 1 1 2 1 2 NA NA 1 0 ...
##  $ histo1_cat            : Factor w/ 9 levels "al          posi",..: 9 3 3 5 3 3 NA NA 3 3 ...
##  $ er1_cat               : Factor w/ 5 levels "negative","own unkn",..: 3 3 3 5 3 1 NA NA 1 1 ...
##  $ pr1_cat               : Factor w/ 7 levels "negative","own 0 Ot",..: 3 3 3 1 3 1 NA NA 3 7 ...
##  $ status.x              : num  0 0 0 0 0 0 NA NA 0 0 ...
##  $ status2.x             : Factor w/ 4 levels "","0","1","h": 2 2 2 2 2 2 NA NA 2 2 ...
##  $ sub_race              : num  0 0 0 0 0 0 NA NA 0 0 ...
##  $ er1_num               : num  1 1 1 NA 1 0 NA NA 0 0 ...
##  $ er1                   : int  1 1 1 NA 1 0 NA NA 0 0 ...
##  $ horm_tmx              : num  0 1 1 0 2 0 NA NA 1 0 ...
##  $ XRTBreast.x           : num  0 0 0 1 0 1 NA NA 1 0 ...
##  $ dose_caseloc          : num  0 0 0 0.96 NA 0.88 NA NA 0.76 0 ...
##  $ good_location.x       : num  1 1 0 1 1 1 NA NA 0 0 ...
##  $ avedose               : num  0 0 0 1.65 NA 1.45 NA NA 0.91 0 ...
##  $ tmx                   : num  0 1 1 0 0 0 NA NA 1 0 ...
##  $ num_preg              : num  1 1 0 1 1 0 NA NA 1 1 ...
##  $ fam_hist              : num  0 0 0 0 0 0 NA NA 0 0 ...
##  $ age_menarche          : int  1 1 1 1 0 0 NA NA 0 0 ...
##  $ lobular               : num  0 0 0 0 0 0 NA NA 0 0 ...
##  $ age_menopause         : int  0 2 0 0 0 0 NA NA 0 0 ...
##  $ conf_miss             : num  0 0 0 0 0 0 NA NA 0 0 ...
##  $ dose_num              : num  0 0 0 1 NA 1 NA NA 1 0 ...
##  $ cmf                   : Factor w/ 6 levels "","\xb0\x9b@",..: 4 6 5 5 4 4 NA NA 4 4 ...
##  $ cmf_012               : num  1 2 0 0 1 1 NA NA 1 1 ...
##  $ setno.y               : int  382125 204356 360683 226881 357431 374980 201558 201921 213991 385058 ...
##  $ cc.y                  : int  0 0 0 0 0 0 1 1 0 0 ...
##  $ chemo                 : int  1 1 0 0 1 1 1 1 1 1 ...
##  $ hormone               : int  0 1 1 0 1 0 0 1 1 0 ...
##  $ chemo_hormone         : Factor w/ 5 levels "","both","chem",..: 3 2 4 5 2 3 3 2 2 3 ...
##  $ chemo_self_mra.y      : int  1 1 0 0 1 1 1 1 1 1 ...
##  $ hormone_self_mra.y    : int  0 1 1 0 1 0 0 1 1 0 ...
##  $ treatment.y           : int  1 1 1 0 1 1 1 1 1 1 ...
##  $ ID.y                  : int  2 6 7 9 11 12 16 22 24 26 ...
##  $ labid.y               : Factor w/ 498 levels "id200054","id200491",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ status.y              : int  0 0 0 0 0 0 1 1 0 0 ...
##  $ status2.y             : int  0 0 0 0 0 0 1 1 0 0 ...
##  $ offset.y              : num  6.41 5.82 5.61 4.03 4.77 ...
##  $ sub_dx_age.y          : int  46 50 46 44 43 39 45 40 43 36 ...
##  $ XRTBreast.y           : int  0 0 0 1 0 1 0 0 1 0 ...
##  $ Eigen_1.y             : num  -0.0079 -0.01062 -0.00539 -0.00906 -0.01094 ...
##  $ Eigen_2.y             : num  0.0055 0.00414 0.00302 0.00301 0.0023 ...
##  $ Eigen_3.y             : num  -0.02171 0.01254 0.00368 -0.00655 -0.01389 ...
##  $ Eigen_4.y             : num  0.00356 -0.00787 -0.01496 -0.01256 -0.00501 ...
##  $ Eigen_5.y             : num  0.00135 -0.0291 0.00391 0.01357 -0.00526 ...
##  $ dose                  : Factor w/ 3 levels "ge 1Gy","ls 1Gy",..: 3 3 3 2 3 2 3 3 2 3 ...
##  $ dsmiss                : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ good_location.y       : int  1 1 0 1 1 1 0 1 0 0 ...
##  $ Deleterious           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ registry.y            : Factor w/ 5 levels "de","IA","IR",..: 1 3 3 5 5 4 3 2 2 1 ...
##  $ race.y                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ age_stratum1          : Factor w/ 5 levels "20to34","35to39",..: 4 4 4 3 3 2 3 3 3 2 ...
##  $ dxyr_stratum          : int  2 2 3 1 1 2 1 2 3 2 ...
##  $ CMF_Only              : int  1 0 0 0 0 1 0 0 1 1 ...
##  $ family_history.y      : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ sub_dx_age_cat        : int  0 0 0 1 1 1 0 1 1 1 ...
##  $ CMF                   : Factor w/ 3 levels "CMF","no","Oth": 1 3 2 2 3 1 3 3 1 1 ...
##  $ XRTBrCHAR             : int  0 0 0 1 0 1 0 0 1 0 ...
demographics.df[1:5,1:5]
##   Subject_ID ID.x  labid.x    Eigen_1.x   Eigen_2.x
## 1     200054    2 id200054 -0.007897666 0.005498053
## 2     200491    6 id200491 -0.010615326 0.004141289
## 3     200565    7 id200565 -0.005393813 0.003017045
## 4     200698    9 id200698 -0.009061714 0.003010147
## 5     200958   11 id200958 -0.010936549 0.002295868
dim(samples.df)
## [1] 512   4
str(samples.df)
## 'data.frame':    512 obs. of  4 variables:
##  $ wes_id   : Factor w/ 512 levels "P1_A01","P1_A02",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ gwas_id  : Factor w/ 510 levels "id200054","id200491",..: 14 405 315 264 67 121 326 251 281 141 ...
##  $ merged_id: Factor w/ 512 levels "P1_A01_id203568",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ filter   : Factor w/ 5 levels "duplicate","low_concordance",..: 5 5 5 5 5 5 5 5 5 5 ...
samples.df[1:5,]
##   wes_id  gwas_id       merged_id filter
## 1 P1_A01 id203568 P1_A01_id203568   pass
## 2 P1_A02 id357807 P1_A02_id357807   pass
## 3 P1_A03 id325472 P1_A03_id325472   pass
## 4 P1_A04 id304354 P1_A04_id304354   pass
## 5 P1_A05 id222648 P1_A05_id222648   pass
dim(vv.df)
## [1] 19798    49
str(vv.df)
## 'data.frame':    19798 obs. of  49 variables:
##  $ MultiAllelic           : logi  NA NA NA NA NA NA ...
##  $ NDA                    : int  1 1 1 2 1 1 1 1 1 1 ...
##  $ TYPE                   : Factor w/ 2 levels "INDEL","SNP": 2 2 2 1 2 2 2 2 2 2 ...
##  $ CHROM                  : Factor w/ 25 levels "1","10","11",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ POS                    : int  881871 883883 883946 892487 897287 897792 979748 987173 989899 1132513 ...
##  $ REF                    : Factor w/ 9189 levels "A","AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGAATAAATAAAAT",..: 4792 6632 2878 2879 2878 2878 1 6632 2878 2878 ...
##  $ ALT                    : Factor w/ 4394 levels "A","AAAAAAAAAAAAAAAC",..: 1 1125 3321 1125 3321 3321 3321 2230 3321 3321 ...
##  $ QUAL                   : num  268 333 429 1770 321 ...
##  $ DP                     : int  14713 15345 17601 15782 13649 12434 11763 18698 16424 17543 ...
##  $ VQSLOD                 : num  18.33 15.58 14.87 2.79 17.59 ...
##  $ FILTER                 : Factor w/ 46 levels "INDEL_DP_LESS_THAN_5120.0",..: 35 35 35 35 35 35 35 35 35 35 ...
##  $ AC                     : int  1 2 1 2 1 1 36 1 1 2 ...
##  $ AF                     : num  0.000978 0.001953 0.000977 0.001953 0.000982 ...
##  $ AN                     : int  1022 1024 1024 1024 1018 1022 1020 1024 1024 1024 ...
##  $ NEGATIVE_TRAIN_SITE    : Factor w/ 1 level "true": NA NA NA NA NA NA NA NA NA NA ...
##  $ POSITIVE_TRAIN_SITE    : Factor w/ 1 level "true": NA NA NA NA NA NA 1 NA NA NA ...
##  $ ALT_frequency_in_1k_90 : Factor w/ 1 level "true": NA NA NA NA NA NA NA NA NA NA ...
##  $ ALT_frequency_in_1k_95 : Factor w/ 1 level "true": NA NA NA NA NA NA NA NA NA NA ...
##  $ ALT_frequency_in_1k_99 : Factor w/ 1 level "true": NA NA NA NA NA NA NA NA NA NA ...
##  $ ALT_frequency_in_1k_100: Factor w/ 1 level "true": NA NA NA NA NA NA NA NA NA NA ...
##  $ SYMBOL                 : Factor w/ 21214 levels "","A1BG","A1CF",..: 11872 11872 11872 11872 9227 9227 617 617 617 19515 ...
##  $ Allele                 : Factor w/ 3662 levels "-","A","AA","AAA",..: 2 839 2726 1 2726 2726 2726 1798 2726 2726 ...
##  $ ALLELE_NUM             : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Consequence            : Factor w/ 95 levels "3_prime_UTR_variant",..: 78 27 27 9 27 78 27 27 27 27 ...
##  $ IMPACT                 : Factor w/ 4 levels "HIGH","LOW","MODERATE",..: 1 3 3 1 3 1 3 3 3 3 ...
##  $ CLIN_SIG               : Factor w/ 95 levels "","association",..: 1 1 1 1 1 1 3 1 1 1 ...
##  $ SIFT                   : Factor w/ 205 levels "","deleterious(0)",..: 1 2 3 1 6 1 3 2 4 4 ...
##  $ PolyPhen               : Factor w/ 1003 levels "","benign(0)",..: 1 938 948 1 999 1 934 1001 993 1001 ...
##  $ Existing_variation     : Factor w/ 563471 levels "","1KG_12_3747324",..: 1 98520 380067 527722 521349 484071 25218 471172 124473 523764 ...
##  $ GMAF                   : Factor w/ 16424 levels "","-:0.0000",..: 1 5980 12791 1 1 1 12918 1 1 1 ...
##  $ AFR_MAF                : Factor w/ 10312 levels "","-:0","-:0&-:0",..: 1 3689 7935 1 1 1 7936 1 1 1 ...
##  $ ASN_MAF                : logi  NA NA NA NA NA NA ...
##  $ EUR_MAF                : Factor w/ 8788 levels "","-:0","-:0&-:0",..: 1 3132 6730 1 1 1 6828 1 1 1 ...
##  $ EAS_MAF                : Factor w/ 8404 levels "","-:0","-:0&-:0",..: 1 3011 6463 1 1 1 6477 1 1 1 ...
##  $ SAS_MAF                : Factor w/ 8543 levels "","-:0","-:0&-:0",..: 1 3033 6541 1 1 1 6591 1 1 1 ...
##  $ AA_MAF                 : Factor w/ 32797 levels "","-:0","-:0&-:0",..: 1 9743 1 2 1 1 25621 1 25480 1 ...
##  $ EA_MAF                 : Factor w/ 33625 levels "","-:0","-:0&-:0",..: 1 10068 1 9 1 1 26573 1 26019 1 ...
##  $ cDNA_position          : Factor w/ 23204 levels "","0-1","1","?-1",..: 5983 5178 4817 12987 18606 21591 7920 16660 17244 5843 ...
##  $ CDS_position           : Factor w/ 20550 levels "","1","?-1","10",..: 5132 4361 4062 10114 14641 17700 6813 14511 15052 4440 ...
##  $ Codons                 : Factor w/ 6876 levels "","-/A","-/AA",..: 2215 4055 2887 4121 2561 2875 4074 5612 823 743 ...
##  $ Protein_position       : Factor w/ 9321 levels "","1","?-1","10",..: 7541 7149 6999 441 2379 3622 8316 2321 2536 7193 ...
##  $ Amino_acids            : Factor w/ 4479 levels "","-/*","*","A",..: 2862 791 3133 627 2808 3076 906 4370 3875 3827 ...
##  $ DISTANCE               : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ STRAND                 : int  -1 -1 -1 -1 1 1 1 1 1 1 ...
##  $ SYMBOL_SOURCE          : Factor w/ 7 levels "","Clone_based_ensembl_gene",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ SIFT_Call              : Factor w/ 4 levels "deleterious",..: NA 1 1 NA 1 NA 1 1 1 1 ...
##  $ SIFT_Score             : num  NA 0 0.01 NA 0.04 NA 0.01 0 0.02 0.02 ...
##  $ PolyPhen_Call          : Factor w/ 4 levels "benign","possibly_damaging",..: NA 3 3 NA 3 NA 3 3 3 3 ...
##  $ PolyPhen_Score         : num  NA 0.936 0.946 NA 0.997 NA 0.932 0.999 0.991 0.999 ...
vv.df[1:5,1:5]
##              MultiAllelic NDA  TYPE CHROM    POS
## var000000170           NA   1   SNP     1 881871
## var000000182           NA   1   SNP     1 883883
## var000000184           NA   1   SNP     1 883946
## var000000217           NA   2 INDEL     1 892487
## var000000239           NA   1   SNP     1 897287
dim(gt.mx)
## [1] 19798   512
class(gt.mx)
## [1] "matrix"
gt.mx[10:25,1:5]
##              P1_A01.GT P1_A02.GT P1_A03.GT P1_A04.GT P1_A05.GT
## var000000764         0         0         0         0         0
## var000001089         0        NA        NA         0         0
## var000001126         0         0         0         0         0
## var000001160         0         0         0         0         0
## var000001342         0         0         0         0         0
## var000001434         0        NA        NA         0         0
## var000001441         0         0         0         1         0
## var000001462         0         0         0         0         0
## var000001507         0         0         0         0         0
## var000001923         0         0         0         0        NA
## var000001963         0         0         0         0         0
## var000002278         0        NA         0         0         0
## var000002279         2        NA         1         1         1
## var000002294         2         0         1         1         1
## var000002426         0         0         0         0         0
## var000002458         0         0         0         0         0
# Check consistence of rownames
sum(rownames(gt.mx) != rownames(vv.df), na.rm=TRUE)
## [1] 0

reshape_variants_annotations

# select relevant variants annotations
variants.df <- 
  vv.df %>% 
  select(SYMBOL, TYPE, CHROM, POS, REF, ALT, AC, AF, AN, 
         Consequence, SIFT, PolyPhen, CLIN_SIG, 
         GMAF, EUR_MAF, ALT_frequency_in_1k_95)

rm(vv.df)

# Explore cases with confusing GMAF/EUR_MAF containing & symbol
nrow(variants.df %>% filter(grepl("&",GMAF)))
## [1] 0
nrow(variants.df %>% filter(grepl("&",EUR_MAF)))
## [1] 97
variants.df %>% 
  filter(grepl("&",EUR_MAF)) %>% 
  select(c(1:6), GMAF, EUR_MAF) %>% 
  top_n(10,EUR_MAF)
##       SYMBOL TYPE CHROM       POS REF ALT     GMAF           EUR_MAF
## 1      RPE65  SNP     1  68910315   C   T T:0.0034           T:0&T:0
## 2      ABCA4  SNP     1  94485278   C   T T:0.0004           T:0&T:0
## 3       CAV3  SNP     3   8775661   C   T T:0.3710 T:0.2684&T:0.2684
## 4       CASR  SNP     3 122003757   G   T T:0.0942 T:0.1451&T:0.1451
## 5        KDR  SNP     4  55979558   C   T T:0.1526 T:0.0944&T:0.0944
## 6      POMT1  SNP     9 134385435   C   T T:0.0787 T:0.1461&T:0.1461
## 7      KRT75  SNP    12  52827608   C   T T:0.1432 T:0.1213&T:0.1213
## 8    RPGRIP1  SNP    14  21790040   G   T T:0.1673 T:0.2366&T:0.2366
## 9  TNFRSF13B  SNP    17  16843666   C   T T:0.0002           T:0&T:0
## 10 C1GALT1C1  SNP     X 119760629   A   T T:0.1475 T:0.1809&T:0.1809
# There are even 3 variants with triplicated values separated by &
variants.df[c(1235, 11405, 15894),c("GMAF", "EUR_MAF")]
##                  GMAF                 EUR_MAF
## var000050038 T:0.0004 T:0.001&T:0.001&T:0.001
## var000449558 A:0.0002             A:0&A:0&A:0
## var000628093 A:0.0008 A:0.002&A:0.002&A:0.002
# Split cases with confusing EUR_MAF containing "&"" symbol
variants.df <- 
  variants.df %>% 
  separate(EUR_MAF, c("EUR_MAF1", "EUR_MAF2"), "&") 
## Warning: Too many values at 3 locations: 1235, 11405, 15894
## Warning: Too few values at 19701 locations: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
## 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...
# All duplicated values are the same
sum(variants.df$EUR_MAF1 != variants.df$EUR_MAF2, na.rm=TRUE)
## [1] 0
# Remove one of the duplicates
variants.df <- 
  variants.df %>% 
  select(-EUR_MAF2)

# split GMAF and EUR_MAF1
variants.df <- 
  variants.df %>% 
  separate(GMAF, c("GMAF_Allele", "GMAF_Fraction"), ":") %>% 
  separate(EUR_MAF1, c("EUR_MAF_Allele", "EUR_MAF_Fraction"), ":") 
## Warning: Too few values at 11392 locations: 1, 4, 5, 6, 8, 9, 10, 12, 19,
## 20, 25, 26, 29, 30, 31, 32, 33, 35, 36, 37, ...
## Warning: Too few values at 11390 locations: 1, 4, 5, 6, 8, 9, 10, 12, 19,
## 20, 25, 26, 29, 30, 31, 32, 33, 35, 36, 37, ...
# Convert columns to numeric etc
as.numeric(variants.df$GMAF_Fraction) -> variants.df$GMAF_Fraction
as.numeric(variants.df$EUR_MAF_Fraction) -> variants.df$EUR_MAF_Fraction

# Change "" to NAs
NA -> variants.df$GMAF_Allele[variants.df$GMAF_Allele == ""]
NA -> variants.df$EUR_MAF_Allele[variants.df$EUR_MAF_Allele == ""]
NA -> variants.df$SIFT[variants.df$SIFT == ""]
NA -> variants.df$PolyPhen[variants.df$PolyPhen == ""]
NA -> variants.df$CLIN_SIG[variants.df$CLIN_SIG == ""]

# Convert chars to factors
as.factor(variants.df$GMAF_Allele) -> variants.df$GMAF_Allele
as.factor(variants.df$EUR_MAF_Allele) -> variants.df$EUR_MAF_Allele

# Change "true" to TRUE
sum(variants.df$ALT_frequency_in_1k_95 == "true", na.rm = TRUE)
## [1] 39
variants.df <- 
  variants.df %>% 
  mutate(frequent_in_1k=as.logical(
    str_replace_all(ALT_frequency_in_1k_95, "true", TRUE))) %>% 
  select(-ALT_frequency_in_1k_95)
sum(variants.df$frequent_in_1k, na.rm=TRUE)
## [1] 39
# Split SIFT
variants.df <- 
  variants.df %>% 
  mutate(SIFT_call=sub("\\(.*\\)","",SIFT)) %>% 
  mutate(SIFT_score=as.numeric(
    sub(".*\\(","", sub("\\)","",SIFT)))) %>% 
  select(-SIFT)

# Split PolyPhen
variants.df <- 
  variants.df %>% 
  mutate(PolyPhen_call=sub("\\(.*\\)","",PolyPhen)) %>% 
  mutate(PolyPhen_score=as.numeric(
    sub(".*\\(","", sub("\\)","",PolyPhen)))) %>% 
  select(-PolyPhen)

# Check variants table
dim(variants.df)
## [1] 19798    20
str(variants.df)
## 'data.frame':    19798 obs. of  20 variables:
##  $ SYMBOL          : Factor w/ 21214 levels "","A1BG","A1CF",..: 11872 11872 11872 11872 9227 9227 617 617 617 19515 ...
##  $ TYPE            : Factor w/ 2 levels "INDEL","SNP": 2 2 2 1 2 2 2 2 2 2 ...
##  $ CHROM           : Factor w/ 25 levels "1","10","11",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ POS             : int  881871 883883 883946 892487 897287 897792 979748 987173 989899 1132513 ...
##  $ REF             : Factor w/ 9189 levels "A","AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGAATAAATAAAAT",..: 4792 6632 2878 2879 2878 2878 1 6632 2878 2878 ...
##  $ ALT             : Factor w/ 4394 levels "A","AAAAAAAAAAAAAAAC",..: 1 1125 3321 1125 3321 3321 3321 2230 3321 3321 ...
##  $ AC              : int  1 2 1 2 1 1 36 1 1 2 ...
##  $ AF              : num  0.000978 0.001953 0.000977 0.001953 0.000982 ...
##  $ AN              : int  1022 1024 1024 1024 1018 1022 1020 1024 1024 1024 ...
##  $ Consequence     : Factor w/ 95 levels "3_prime_UTR_variant",..: 78 27 27 9 27 78 27 27 27 27 ...
##  $ CLIN_SIG        : Factor w/ 95 levels "","association",..: NA NA NA NA NA NA 3 NA NA NA ...
##  $ GMAF_Allele     : Factor w/ 46 levels "-","A","AA","AAAT",..: NA 15 36 NA NA NA 36 NA NA NA ...
##  $ GMAF_Fraction   : num  NA 0.0002 0.0002 NA NA NA 0.0184 NA NA NA ...
##  $ EUR_MAF_Allele  : Factor w/ 47 levels "-","A","AA","AAAT",..: NA 15 36 NA NA NA 36 NA NA NA ...
##  $ EUR_MAF_Fraction: num  NA 0 0 NA NA NA 0.0467 NA NA NA ...
##  $ frequent_in_1k  : logi  NA NA NA NA NA NA ...
##  $ SIFT_call       : chr  NA "deleterious" "deleterious" NA ...
##  $ SIFT_score      : num  NA 0 0.01 NA 0.04 NA 0.01 0 0.02 0.02 ...
##  $ PolyPhen_call   : chr  NA "probably_damaging" "probably_damaging" NA ...
##  $ PolyPhen_score  : num  NA 0.936 0.946 NA 0.997 NA 0.932 0.999 0.991 0.999 ...
variants.df[1:5,1:5]
##   SYMBOL  TYPE CHROM    POS REF
## 1  NOC2L   SNP     1 881871   G
## 2  NOC2L   SNP     1 883883   T
## 3  NOC2L   SNP     1 883946   C
## 4  NOC2L INDEL     1 892487  CA
## 5 KLHL17   SNP     1 897287   C

reshape_covar

dim(covar.df)
## [1] 498  34
colnames(covar.df)
##  [1] "Subject_ID"       "setno"            "cc"              
##  [4] "chemo"            "hormone"          "chemo_hormone"   
##  [7] "chemo_self_mra"   "hormone_self_mra" "treatment"       
## [10] "ID"               "labid"            "status"          
## [13] "status2"          "offset"           "sub_dx_age"      
## [16] "XRTBreast"        "Eigen_1"          "Eigen_2"         
## [19] "Eigen_3"          "Eigen_4"          "Eigen_5"         
## [22] "dose"             "dsmiss"           "good_location"   
## [25] "Deleterious"      "registry"         "race"            
## [28] "age_stratum1"     "dxyr_stratum"     "CMF_Only"        
## [31] "family_history"   "sub_dx_age_cat"   "CMF"             
## [34] "XRTBrCHAR"
attach(covar.df)

# chemo is the same as chemo_self_mra, no missed values
sum(chemo != chemo_self_mra)
## [1] 0
# One missed case in hormone/ hormone_self_mra
sum(is.na(hormone))
## [1] 1
sum(is.na(hormone_self_mra))
## [1] 1
# hormone has 10 differences from hormone_self_mra
sum(hormone != hormone_self_mra, na.rm=TRUE)
## [1] 10
# status is the same as status2, no missed values
sum(status != status2)
## [1] 0
# Race is always 0, no missed data
sum(race)
## [1] 0
# Dismiss split 495 vs 3, no missed data
sum(dsmiss)
## [1] 3
# good_location split 86 vs 412, no missed data
sum(good_location)
## [1] 412
# Deleterious split 480 vs 18, no missed data
sum(Deleterious)
## [1] 18
# Deleterious split 158 vs 340, no missed data
sum(family_history)
## [1] 158
# XRTBrCHAR is the same as XRTBreast, no missed values
sum(XRTBrCHAR != XRTBreast)
## [1] 0
# sub_dx_age_cat split 169 vs 329, no missed data
sum(sub_dx_age_cat)
## [1] 329
# dose
summary(dose)
##  ge 1Gy  ls 1Gy no dose 
##     107     153     238
detach(covar.df)

# Keep only selected annotations
selected_annotations <- c(
  "labid","cc","sub_dx_age","offset",
  "chemo","hormone","XRTBreast","dose",
  "Eigen_1","Eigen_2","Eigen_3","Eigen_4","Eigen_5")

# Keep only selected annotations
covar.df <- covar.df[,selected_annotations]
rm(selected_annotations)

# change XRTBreast to xrt
"xrt" -> colnames(covar.df)[7]

# Check result
dim(covar.df)
## [1] 498  13
str(covar.df)
## 'data.frame':    498 obs. of  13 variables:
##  $ labid     : Factor w/ 498 levels "id200054","id200491",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ cc        : int  0 0 0 0 0 0 1 1 0 0 ...
##  $ sub_dx_age: int  46 50 46 44 43 39 45 40 43 36 ...
##  $ offset    : num  6.41 5.82 5.61 4.03 4.77 ...
##  $ chemo     : int  1 1 0 0 1 1 1 1 1 1 ...
##  $ hormone   : int  0 1 1 0 1 0 0 1 1 0 ...
##  $ xrt       : int  0 0 0 1 0 1 0 0 1 0 ...
##  $ dose      : Factor w/ 3 levels "ge 1Gy","ls 1Gy",..: 3 3 3 2 3 2 3 3 2 3 ...
##  $ Eigen_1   : num  -0.0079 -0.01062 -0.00539 -0.00906 -0.01094 ...
##  $ Eigen_2   : num  0.0055 0.00414 0.00302 0.00301 0.0023 ...
##  $ Eigen_3   : num  -0.02171 0.01254 0.00368 -0.00655 -0.01389 ...
##  $ Eigen_4   : num  0.00356 -0.00787 -0.01496 -0.01256 -0.00501 ...
##  $ Eigen_5   : num  0.00135 -0.0291 0.00391 0.01357 -0.00526 ...
covar.df[1:5,1:5]
##      labid cc sub_dx_age   offset chemo
## 1 id200054  0         46 6.406880     1
## 2 id200491  0         50 5.820083     1
## 3 id200565  0         46 5.609472     0
## 4 id200698  0         44 4.034241     0
## 5 id200958  0         43 4.770685     1

explore_demographics_table

exclude_odd_cases

# Outlook
dim(demographics.df)
## [1] 505  91
colnames(demographics.df)
##  [1] "Subject_ID"             "ID.x"                  
##  [3] "labid.x"                "Eigen_1.x"             
##  [5] "Eigen_2.x"              "Eigen_3.x"             
##  [7] "Eigen_4.x"              "Eigen_5.x"             
##  [9] "Phase"                  "setno.x"               
## [11] "cc.x"                   "rstime"                
## [13] "registry.x"             "race.x"                
## [15] "offset.x"               "DOB"                   
## [17] "sub_dx_age.x"           "refage"                
## [19] "BMI_age18"              "BMI_dx"                
## [21] "BMI_ref"                "hormone_self_mra.x"    
## [23] "chemo_self_mra.x"       "treatment.x"           
## [25] "family_history.x"       "rh_age_menarche"       
## [27] "age_menopause_1yrbf_fd" "age_1fftp_fd"          
## [29] "Num_ftp_fd"             "Histology"             
## [31] "Histology1"             "Hist_lob_other"        
## [33] "stage_fd"               "er_fd"                 
## [35] "pr_fd"                  "histo1_cat"            
## [37] "er1_cat"                "pr1_cat"               
## [39] "status.x"               "status2.x"             
## [41] "sub_race"               "er1_num"               
## [43] "er1"                    "horm_tmx"              
## [45] "XRTBreast.x"            "dose_caseloc"          
## [47] "good_location.x"        "avedose"               
## [49] "tmx"                    "num_preg"              
## [51] "fam_hist"               "age_menarche"          
## [53] "lobular"                "age_menopause"         
## [55] "conf_miss"              "dose_num"              
## [57] "cmf"                    "cmf_012"               
## [59] "setno.y"                "cc.y"                  
## [61] "chemo"                  "hormone"               
## [63] "chemo_hormone"          "chemo_self_mra.y"      
## [65] "hormone_self_mra.y"     "treatment.y"           
## [67] "ID.y"                   "labid.y"               
## [69] "status.y"               "status2.y"             
## [71] "offset.y"               "sub_dx_age.y"          
## [73] "XRTBreast.y"            "Eigen_1.y"             
## [75] "Eigen_2.y"              "Eigen_3.y"             
## [77] "Eigen_4.y"              "Eigen_5.y"             
## [79] "dose"                   "dsmiss"                
## [81] "good_location.y"        "Deleterious"           
## [83] "registry.y"             "race.y"                
## [85] "age_stratum1"           "dxyr_stratum"          
## [87] "CMF_Only"               "family_history.y"      
## [89] "sub_dx_age_cat"         "CMF"                   
## [91] "XRTBrCHAR"
demographics.df[1:10,1:5]
##    Subject_ID ID.x  labid.x    Eigen_1.x    Eigen_2.x
## 1      200054    2 id200054 -0.007897666  0.005498053
## 2      200491    6 id200491 -0.010615326  0.004141289
## 3      200565    7 id200565 -0.005393813  0.003017045
## 4      200698    9 id200698 -0.009061714  0.003010147
## 5      200958   11 id200958 -0.010936549  0.002295868
## 6      201046   12 id201046 -0.010397451  0.004503799
## 7      201558   NA     <NA>           NA           NA
## 8      201921   NA     <NA>           NA           NA
## 9      202026   24 id202026 -0.011427344  0.003990090
## 10     202236   26 id202236 -0.007524541 -0.001414092
# Exclude odd cases
odd_cases=c("201558","201921","202698","21IAno","387460",
            "389192","58IR1+","60IA1+","92IRno","98IAno")

demographics.df %>% 
  filter(Subject_ID %in% odd_cases) %>% 
  select(1:5)
##    Subject_ID           ID.x  labid.x      Eigen_1.x      Eigen_2.x
## 1      201558             NA     <NA>             NA             NA
## 2      201921             NA     <NA>             NA             NA
## 3      202698             NA     <NA>             NA             NA
## 4      21IAno   1.164234e-25 19212019 -2.400941e-267 -1.400366e-103
## 5      387460             NA     <NA>             NA             NA
## 6      389192             NA     <NA>             NA             NA
## 7      58IR1+  -9.238075e+11 15582015  6.560630e-155 -6.845025e-292
## 8      60IA1+ -7.255563e-207 74603874  7.125925e-196  -1.737869e+90
## 9      92IRno  3.394376e+132 91923891  -2.236906e+99   8.169427e-68
## 10     98IAno  7.355810e+306 26982026   7.797358e+36  -1.779783e-65
pheno.df <- demographics.df %>% filter(! Subject_ID %in% odd_cases)
dim(pheno.df)
## [1] 495  91
pheno.df[1:10,1:5]
##    Subject_ID ID.x  labid.x    Eigen_1.x    Eigen_2.x
## 1      200054    2 id200054 -0.007897666  0.005498053
## 2      200491    6 id200491 -0.010615326  0.004141289
## 3      200565    7 id200565 -0.005393813  0.003017045
## 4      200698    9 id200698 -0.009061714  0.003010147
## 5      200958   11 id200958 -0.010936549  0.002295868
## 6      201046   12 id201046 -0.010397451  0.004503799
## 7      202026   24 id202026 -0.011427344  0.003990090
## 8      202236   26 id202236 -0.007524541 -0.001414092
## 9      202718   31 id202718 -0.012475710  0.005576766
## 10     203186   36 id203186 -0.008093648  0.012604182

exclude_duplicated_cases

duplicated_cases <- 
  pheno.df %>% 
  filter(duplicated(Subject_ID)) %>% 
  select(Subject_ID)

duplicated_cases <- as.vector(duplicated_cases[,1])
duplicated_cases
## [1] "259643" "272715"
# Phenotype annotations for the duplicate record are identical
pheno.df %>% filter(Subject_ID %in% duplicated_cases)
##   Subject_ID ID.x  labid.x    Eigen_1.x   Eigen_2.x    Eigen_3.x
## 1     259643  574 id259643 -0.009376837 0.002435931 -0.008098524
## 2     259643  574 id259643 -0.009376837 0.002435931 -0.008098524
## 3     272715  664 id272715 -0.013179119 0.008036325  0.002829476
## 4     272715  664 id272715 -0.013179119 0.008036325  0.002829476
##      Eigen_4.x    Eigen_5.x Phase setno.x cc.x rstime registry.x race.x
## 1 -0.012619313  0.006557315     1  259643    1   3.33         IA      0
## 2 -0.012619313  0.006557315     1  259643    1   3.33         IA      0
## 3  0.004044336 -0.013210921     1  259643    0   3.33         IA      0
## 4  0.004044336 -0.013210921     1  259643    0   3.33         IA      0
##   offset.x   DOB sub_dx_age.x refage BMI_age18   BMI_dx  BMI_ref
## 1 5.384495 -6547           47     50  18.88158 22.31460 22.31460
## 2 5.384495 -6547           47     50  18.88158 22.31460 22.31460
## 3 3.417727 -7883           48     50  23.43605 29.29506 29.29506
## 4 3.417727 -7883           48     50  23.43605 29.29506 29.29506
##   hormone_self_mra.x chemo_self_mra.x treatment.x family_history.x
## 1                  1                0           1             none
## 2                  1                0           1             none
## 3                  0                0           0               1+
## 4                  0                0           0               1+
##   rh_age_menarche age_menopause_1yrbf_fd age_1fftp_fd Num_ftp_fd Histology
## 1              13                     -1           19          2     other
## 2              13                     -1           19          2     other
## 3              12                     -1           25          2     other
## 4              12                     -1           25          2     other
##   Histology1 Hist_lob_other stage_fd er_fd pr_fd histo1_cat  er1_cat
## 1      other          other        1     1     1     ductal positive
## 2      other          other        1     1     1     ductal positive
## 3      other          other        1     2     2     ductal negative
## 4      other          other        1     2     2     ductal negative
##    pr1_cat status.x status2.x sub_race er1_num er1 horm_tmx XRTBreast.x
## 1 positive        1         1        0       1   1        1           0
## 2 positive        1         1        0       1   1        1           0
## 3 negative        0         0        0       0   0        0           1
## 4 negative        0         0        0       0   0        0           1
##   dose_caseloc good_location.x avedose tmx num_preg fam_hist age_menarche
## 1          0.0               1    0.00   1        1        0            1
## 2          0.0               1    0.00   1        1        0            1
## 3          1.4               1    2.22   0        1        1            0
## 4          1.4               1    2.22   0        1        1            0
##   lobular age_menopause conf_miss dose_num cmf cmf_012 setno.y cc.y chemo
## 1       0             0         0        0  no       0  259643    1     0
## 2       0             0         0        0  no       0  259643    1     0
## 3       0             0         0        2  no       0  259643    0     0
## 4       0             0         0        2  no       0  259643    0     0
##   hormone chemo_hormone chemo_self_mra.y hormone_self_mra.y treatment.y
## 1       1          horm                0                  1           1
## 2       1          horm                0                  1           1
## 3       0          none                0                  0           0
## 4       0          none                0                  0           0
##   ID.y  labid.y status.y status2.y offset.y sub_dx_age.y XRTBreast.y
## 1  574 id259643        1         1 5.384495           47           0
## 2  574 id259643        1         1 5.384495           47           0
## 3  664 id272715        0         0 3.417727           48           1
## 4  664 id272715        0         0 3.417727           48           1
##      Eigen_1.y   Eigen_2.y    Eigen_3.y    Eigen_4.y    Eigen_5.y    dose
## 1 -0.009376837 0.002435931 -0.008098524 -0.012619313  0.006557315 no dose
## 2 -0.009376837 0.002435931 -0.008098524 -0.012619313  0.006557315 no dose
## 3 -0.013179119 0.008036325  0.002829476  0.004044336 -0.013210921  ge 1Gy
## 4 -0.013179119 0.008036325  0.002829476  0.004044336 -0.013210921  ge 1Gy
##   dsmiss good_location.y Deleterious registry.y race.y age_stratum1
## 1      0               1           0         IA      0       45to49
## 2      0               1           0         IA      0       45to49
## 3      0               1           0         IA      0       45to49
## 4      0               1           0         IA      0       45to49
##   dxyr_stratum CMF_Only family_history.y sub_dx_age_cat CMF XRTBrCHAR
## 1            1        0                0              0  no         0
## 2            1        0                0              0  no         0
## 3            1        0                1              0  no         1
## 4            1        0                1              0  no         1
pheno.df <- pheno.df %>% filter(!duplicated(Subject_ID))
dim(pheno.df)
## [1] 493  91
pheno.df[1:10,1:5]
##    Subject_ID ID.x  labid.x    Eigen_1.x    Eigen_2.x
## 1      200054    2 id200054 -0.007897666  0.005498053
## 2      200491    6 id200491 -0.010615326  0.004141289
## 3      200565    7 id200565 -0.005393813  0.003017045
## 4      200698    9 id200698 -0.009061714  0.003010147
## 5      200958   11 id200958 -0.010936549  0.002295868
## 6      201046   12 id201046 -0.010397451  0.004503799
## 7      202026   24 id202026 -0.011427344  0.003990090
## 8      202236   26 id202236 -0.007524541 -0.001414092
## 9      202718   31 id202718 -0.012475710  0.005576766
## 10     203186   36 id203186 -0.008093648  0.012604182
attach(pheno.df)

x_y_id_cc_fam

# Case-control status

ID.x[1:5]
## [1]  2  6  7  9 11
sum(ID.x != ID.y)
## [1] 0
cc.x[1:20]
##  [1] 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 0 0 1 0 0
sum(cc.x != cc.y)
## [1] 0
ggplot(pheno.df) + 
  aes(as.factor(cc.x), fill=as.factor(cc.x)) +
  geom_bar(colour="black", alpha=0.3)

table(cc.x)
## cc.x
##   0   1 
## 247 246
# Lables x vs y

labid.x[1:5]
## [1] id200054 id200491 id200565 id200698 id200958
## 498 Levels: 15582015 19212019 26982026 74603874 91923891 ... id399943
sum(as.vector(labid.x) != as.vector(labid.y))
## [1] 0
# Familial history

fam_hist[1:5]
## [1] 0 0 0 0 0
sum(fam_hist != family_history.y)
## [1] 0
sum(as.numeric(sub("none",0,sub("1\\+",1,family_history.x))) != family_history.y)
## [1] 0
ggplot(pheno.df) + 
  aes(as.factor(family_history.x), fill=as.factor(cc.x)) +
  geom_bar(colour="black", alpha=0.3)

table(family_history.x)
## family_history.x
##   1+ none othe 
##  156  337    0
table(family_history.y)
## family_history.y
##   0   1 
## 337 156
# No disbalance in familial history between cases and controls

ggplot(pheno.df) + 
  aes(as.factor(cc.x), fill=as.factor(family_history.x)) +
  geom_bar(colour="black", alpha=0.3)

table(cc.y, family_history.y)
##     family_history.y
## cc.y   0   1
##    0 178  69
##    1 159  87
fisher.test(table(cc.y, family_history.y))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  table(cc.y, family_history.y)
## p-value = 0.08182
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.9469101 2.1065143
## sample estimates:
## odds ratio 
##   1.410529

x_y_ages

# 1st Ds age (?)

sub_dx_age.x[1:5]
## [1] 46 50 46 44 43
sum(sub_dx_age.x != sub_dx_age.y)
## [1] 0
ggplot(pheno.df) + 
  aes(sub_dx_age.x, fill=as.factor(cc.x)) +
  geom_histogram(colour="black", alpha=0.3, binwidth = 1)

# Consistent with 55 yr selection criteria

# No differences between cases and controls by age
t.test(sub_dx_age.x~cc.x)
## 
##  Welch Two Sample t-test
## 
## data:  sub_dx_age.x by cc.x
## t = -0.29954, df = 490.96, p-value = 0.7647
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.0209136  0.7508106
## sample estimates:
## mean in group 0 mean in group 1 
##        42.22267        42.35772
ggplot(pheno.df) + 
  aes(sub_dx_age.x, fill=as.factor(cc.x)) +
  geom_density(colour="black", alpha=0.3)

# Compare to Refage (age at 2nd ds???)
ggplot(pheno.df, colour="black") + 
  geom_density(aes(refage), fill="red", alpha=0.3) +
  geom_density(aes(sub_dx_age.x), fill="blue", alpha=0.3)

# No differences between cases and controls by refage
t.test(refage~cc.x)
## 
##  Welch Two Sample t-test
## 
## data:  refage by cc.x
## t = -0.038983, df = 490.73, p-value = 0.9689
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.211407  1.164273
## sample estimates:
## mean in group 0 mean in group 1 
##        47.79757        47.82114
ggplot(pheno.df) + 
  aes(refage, fill=as.factor(cc.x)) +
  geom_density(colour="black", alpha=0.3)

# Categorical age codings

dxyr_stratum[1:5]
## [1] 2 2 3 1 1
age_stratum1[1:5]
## [1] 45to49 45to49 45to49 40to44 40to44
## Levels: 20to34 35to39 40to44 45to49 50to54
sub_dx_age_cat[1:5]
## [1] 0 0 0 1 1
# Unknown offset
# consistent with 1+ year between the 1st and 2nd tumours (selection criteria)

offset.x[1:5]
## [1] 6.406880 5.820083 5.609472 4.034241 4.770685
sum(offset.x != offset.y)
## [1] 0
hist(offset.x) 

# Compare to rstime (~age at ds - age at 2nd ds???)
ggplot(pheno.df, colour="black") + 
  geom_density(aes(offset.x), fill="red", alpha=0.3) +
  geom_density(aes(rstime), fill="blue", alpha=0.3)

min(offset.x)
## [1] 1.252763
min(rstime)
## [1] 1

x_y_any_treatment

# Any treatment 
treatment.x[1:5]
## [1] 1 1 1 0 1
treatment.y[1:5]
## [1] 1 1 1 0 1
sum(treatment.x != treatment.y)
## [1] 0
ggplot(pheno.df) + 
  aes(as.factor(treatment.x), fill=as.factor(cc.x)) +
  geom_bar(colour="black", alpha=0.3)

# There was more treatment in controls
table(cc.x, treatment.x)
##     treatment.x
## cc.x   0   1
##    0  89 158
##    1 112 134
fisher.test(table(cc.x, treatment.x))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  table(cc.x, treatment.x)
## p-value = 0.03518
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.4619463 0.9827664
## sample estimates:
## odds ratio 
##  0.6744954
ggplot(pheno.df) + 
  aes(as.factor(cc.x), fill=as.factor(treatment.x)) +
  geom_bar(colour="black", alpha=0.3)

# for comparison (neither will be used later)
chemo_hormone[1:5] 
## [1] chem both horm none both
## Levels:  both chem horm none

x_y_hormonal_treatment

# Some missed data and discrepansies in endocrine treatment
# (when compare hormone, hormone_self_mra.x and hormone_self_mra.y)
hormone[1:5]
## [1] 0 1 1 0 1
hormone_self_mra.x[1:5]
## [1] 0 1 1 0 1
hormone_self_mra.y[1:5]
## [1] 0 1 1 0 1
sum(hormone_self_mra.x != hormone_self_mra.y, na.rm=TRUE)
## [1] 0
sum(hormone != hormone_self_mra.x, na.rm=TRUE)
## [1] 10
sum(is.na(hormone))
## [1] 1
sum(is.na(hormone_self_mra.x))
## [1] 0
sum(is.na(hormone_self_mra.y))
## [1] 1
ggplot(pheno.df) + 
  aes(as.factor(hormone), fill=as.factor(cc.x)) +
  geom_bar(colour="black", alpha=0.3)

# There was no disbalance in cases and controls by hormonal treatment
table(cc.x,hormone)
##     hormone
## cc.x   0   1
##    0 188  58
##    1 196  50
fisher.test(table(cc.x, hormone))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  table(cc.x, hormone)
## p-value = 0.4459
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.5261579 1.2971381
## sample estimates:
## odds ratio 
##  0.8272048
pheno.df %>% 
  filter(!is.na(hormone)) %>% 
  ggplot() + 
    aes(as.factor(cc.x), fill=as.factor(hormone)) +
    geom_bar(colour="black", alpha=0.3)

# Hormone vs er: 
# 64% er+ves were not given hormones:
# old cohort, young patients, cytotoxic instead ...?
# 11 er-ves were hiven hormones??
table(er1, hormone)
##    hormone
## er1   0   1
##   0 101  11
##   1 148  82
sum(is.na(er1))
## [1] 150

x_y_cytotoxic_treatment

# No missed data or discrepansies in cytotoxic treatment
# (when compare chemo, chemo_self_mra.x and chemo_self_mra.y)
chemo[1:5]
## [1] 1 1 0 0 1
sum(chemo_self_mra.x != chemo_self_mra.y)
## [1] 0
sum(chemo != chemo_self_mra.x)
## [1] 0
# Unbalanced cases and controls in treated/non-treated by cytotoxics
ggplot(pheno.df) + 
  aes(as.factor(chemo), fill=as.factor(cc.x)) +
  geom_bar(colour="black", alpha=0.3)

# There was more chemo in controls
table(cc.x, chemo)
##     chemo
## cc.x   0   1
##    0 108 139
##    1 136 110
fisher.test(table(cc.x, chemo))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  table(cc.x, chemo)
## p-value = 0.01167
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.4334897 0.9108517
## sample estimates:
## odds ratio 
##  0.6290542
ggplot(pheno.df) + 
  aes(as.factor(cc.x), fill=as.factor(chemo)) +
  geom_bar(colour="black", alpha=0.3)

x_y_x_ray_treatment

# No missed data or discrepansies in x-ray treatment
# (when compare XRTBrCHAR, XRTBreast.x and XRTBreast.y)
XRTBrCHAR[1:5]
## [1] 0 0 0 1 0
sum(XRTBreast.x != XRTBreast.y)
## [1] 0
sum(XRTBrCHAR != XRTBreast.x)
## [1] 0
# Unbalanced cases and controls in treated/non-treated by radiotherapy
ggplot(pheno.df) + 
  aes(as.factor(XRTBrCHAR), fill=as.factor(cc.x)) +
  geom_bar(colour="black", alpha=0.3)

# There was more x_ray_treatment in controls
table(cc.x, XRTBrCHAR)
##     XRTBrCHAR
## cc.x   0   1
##    0 103 144
##    1 136 110
fisher.test(table(cc.x, XRTBrCHAR))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  table(cc.x, XRTBrCHAR)
## p-value = 0.002933
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.3985810 0.8394647
## sample estimates:
## odds ratio 
##  0.5791743
ggplot(pheno.df) + 
  aes(as.factor(cc.x), fill=as.factor(XRTBrCHAR)) +
  geom_bar(colour="black", alpha=0.3)

x_y_eigenvectors

Ethnisity: todo PCI co-plotting with known populations Recalculate eigenvectors?

# Eigenvectors - duplicated
sum(Eigen_1.x != Eigen_1.y)
## [1] 0
sum(Eigen_2.x != Eigen_2.y)
## [1] 0
sum(Eigen_3.x != Eigen_3.y)
## [1] 0
sum(Eigen_4.x != Eigen_4.y)
## [1] 0
sum(Eigen_5.x != Eigen_5.y)
## [1] 0

x_y_mixed_duplicated_fields

# setno (unknown field)
setno.x[1:5]
## [1] 382125 204356 360683 226881 357431
sum(setno.x != setno.y)
## [1] 0
t.test(setno.x~cc.x)
## 
##  Welch Two Sample t-test
## 
## data:  setno.x by cc.x
## t = -0.077993, df = 491, p-value = 0.9379
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -10863.19  10033.68
## sample estimates:
## mean in group 0 mean in group 1 
##        304727.1        305141.9
ggplot(pheno.df) +
  aes(setno.x, fill=as.factor(cc.x)) +
  geom_density(colour="black", alpha=0.3)

# Registry
registry.x[1:5]
## [1] de IR IR SE SE
## Levels:  de IA IR LA ne SE
sum(as.vector(registry.x) != as.vector(registry.y))
## [1] 0
fisher.test(table(cc.x,registry.x))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  table(cc.x, registry.x)
## p-value = 1
## alternative hypothesis: two.sided
ggplot(pheno.df) +
  aes(registry.x, fill=as.factor(cc.x)) +
  geom_bar(colour="black", alpha=0.3)

# Good location (unknown field)
good_location.x[1:5]
## [1] 1 1 0 1 1
sum(good_location.x != good_location.y)
## [1] 0
fisher.test(table(cc.x,good_location.x))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  table(cc.x, good_location.x)
## p-value = 1
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.6049165 1.6370561
## sample estimates:
## odds ratio 
##  0.9951316
ggplot(pheno.df) +
  aes(good_location.x, fill=as.factor(cc.x)) +
  geom_bar(colour="black", alpha=0.3)

# Status is equal to cc
status.x[1:5]
## [1] 0 0 0 0 0
sum(status.x != status.y)
## [1] 0
table(cc.x, status.x)
##     status.x
## cc.x   0   1
##    0 247   0
##    1   0 246
sum(status2.x != status2.y)
## [1] 0
sum(status.x != status2.y)
## [1] 0
# Race
sum(race.x != race.y)
## [1] 0
sum(sub_race != race.y)
## [1] 0
sum(sub_race)
## [1] 0

x_ray_treatment

# dose
dose[1:5]
## [1] no dose no dose no dose ls 1Gy  no dose
## Levels: ge 1Gy ls 1Gy no dose
class(dose)
## [1] "factor"
summary(dose)
##  ge 1Gy  ls 1Gy no dose 
##     107     150     236
ggplot(pheno.df) + 
  aes(as.factor(dose), fill=as.factor(cc.x)) +
  geom_bar(colour="black", alpha=0.3)

# More x-ray treatment in controls
fisher.test(table(cc.x, dose))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  table(cc.x, dose)
## p-value = 0.02258
## alternative hypothesis: two.sided
# Balanced doses in x-ray treated cases and controls
pheno.df %>% 
  select(cc.x,dose) %>% 
  filter(dose %in% c("ge 1Gy","ls 1Gy")) %>% 
  table %>% 
  fisher.test
## 
##  Fisher's Exact Test for Count Data
## 
## data:  .
## p-value = 0.8002
## alternative hypothesis: two.sided
# No missed data in dose
sum(is.na(dose))
## [1] 0
# dose_caseloc
dose_caseloc[1:5]
## [1] 0.00 0.00 0.00 0.96   NA
class(dose_caseloc)
## [1] "numeric"
ggplot(pheno.df) +
  aes(dose_caseloc, fill=as.factor(cc.x)) +
  geom_histogram(colour="black", alpha=0.3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing non-finite values (stat_bin).

ggplot(pheno.df) +
  aes(dose_caseloc, fill=as.factor(cc.x)) +
  geom_density(colour="black", alpha=0.3)
## Warning: Removed 3 rows containing non-finite values (stat_density).

sum(dose_caseloc == 0, na.rm=TRUE)
## [1] 233
sum(dose_caseloc != 0 & dose_caseloc < 1, na.rm=TRUE)
## [1] 150
sum(dose_caseloc >=1, na.rm=TRUE)
## [1] 107
sum(is.na(dose_caseloc))
## [1] 3
# Numerically coded dose (0,1,2)
dose_num[1:5]
## [1]  0  0  0  1 NA
class(dose_num)
## [1] "numeric"
ggplot(pheno.df) +
  aes(dose_num, fill=as.factor(cc.x)) +
  geom_bar(colour="black", alpha=0.3)
## Warning: Removed 3 rows containing non-finite values (stat_count).

table(dose_num)
## dose_num
##   0   1   2 
## 233 150 107
sum(is.na(dose_num))
## [1] 3
# Disbalance btw cases and controls (higher doses in controls?)
table(cc.x,dose_num)
##     dose_num
## cc.x   0   1   2
##    0 101  83  61
##    1 132  67  46
fisher.test(table(cc.x,dose_num))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  table(cc.x, dose_num)
## p-value = 0.01868
## alternative hypothesis: two.sided
ggplot(pheno.df) +
  aes(as.factor(cc.x), fill=as.factor(dose_num)) +
  geom_bar(colour="black", alpha=0.3)

# Avedose (unknown dose)
avedose[1:5]
## [1] 0.00 0.00 0.00 1.65   NA
class(avedose)
## [1] "numeric"
ggplot(pheno.df) +
  aes(avedose, fill=as.factor(cc.x)) +
  geom_histogram(colour="black", alpha=0.3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing non-finite values (stat_bin).

ggplot(pheno.df) +
  aes(avedose, fill=as.factor(cc.x)) +
  geom_density(colour="black", alpha=0.3)
## Warning: Removed 3 rows containing non-finite values (stat_density).

# Disbalance between cases and controls
t.test(avedose~cc.x)
## 
##  Welch Two Sample t-test
## 
## data:  avedose by cc.x
## t = 3.0094, df = 480.24, p-value = 0.002755
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.07246196 0.34508906
## sample estimates:
## mean in group 0 mean in group 1 
##       0.7741633       0.5653878
# More balanced amongst treated?
pheno.df %>% filter(avedose!=0) %>% 
  t.test(data=.,avedose~cc.x)
## 
##  Welch Two Sample t-test
## 
## data:  avedose by cc.x
## t = 1.224, df = 252.96, p-value = 0.2221
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.05560374  0.23822788
## sample estimates:
## mean in group 0 mean in group 1 
##        1.317153        1.225841

hormonal_treatment

# Inconsistent data on endocrine treatment
# Apparently 9 patients did not received hormones, 
# while receiving tamoxifen??
table(hormone,tmx)
##        tmx
## hormone   0   1
##       0 375   9
##       1  23  85
# horm_tmx is different from tmx
# Apparently 0 is no hormonal treatment
# What is coded as 1 and 2 ??
horm_tmx[1:5]
## [1] 0 1 1 0 2
ggplot(pheno.df) +
  aes(as.factor(horm_tmx), fill=as.factor(cc.x)) +
  geom_bar(colour="black", alpha=0.3)

table(horm_tmx)
## horm_tmx
##   0   1   2 
## 375  94  24
sum(is.na(horm_tmx))
## [1] 0
# Tmx
tmx[1:5]
## [1] 0 1 1 0 0
barplot(table(tmx))

table(tmx)
## tmx
##   0   1 
## 399  94
sum(is.na(tmx))
## [1] 0
# Treatment by age: hormones more often are given to older patients
x <- !is.na(hormone)
ggplot(pheno.df[x,]) + 
  aes(x = sub_dx_age.x, fill = as.factor(hormone)) +
  geom_density(colour="black", alpha=0.3)

rm(x)

t.test(sub_dx_age.x~hormone)
## 
##  Welch Two Sample t-test
## 
## data:  sub_dx_age.x by hormone
## t = -2.7477, df = 199.2, p-value = 0.006552
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.326010 -0.382323
## sample estimates:
## mean in group 0 mean in group 1 
##        41.97917        43.33333

cytotoxic_treatment

# Reminder: chemo is identical to chemo.x/y
chemo[1:5]
## [1] 1 1 0 0 1
ggplot(pheno.df) +
  aes(as.factor(chemo), fill=as.factor(cc.x)) +
  geom_bar(colour="black", alpha=0.3)

table(chemo)
## chemo
##   0   1 
## 244 249
sum(is.na(chemo))
## [1] 0
# Treatment by age: cytotoxic are more often given to younger patients
ggplot(pheno.df) + 
  aes(x = sub_dx_age.x, fill = as.factor(chemo)) +
  geom_density(colour="black", alpha=0.3)

t.test(sub_dx_age.x~chemo)
## 
##  Welch Two Sample t-test
## 
## data:  sub_dx_age.x by chemo
## t = 6.0245, df = 461.28, p-value = 3.47e-09
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  1.761803 3.467540
## sample estimates:
## mean in group 0 mean in group 1 
##        43.61066        40.99598
# cmf uses wrong encodings
cmf[1:5]
## [1] CMF Oth no  no  CMF
## Levels:  \xb0\x9b@ \024\x9c@ CMF no Oth
ggplot(pheno.df) +
  aes(as.factor(cmf), fill=as.factor(cc.x)) +
  geom_bar(colour="black", alpha=0.3)

table(cmf)
## cmf
##           \xb0\x9b@ \024\x9c@       CMF        no       Oth 
##         0         0         0       153       244        96
sum(is.na(cmf))
## [1] 0
# cmf012 is numeric cmf
# 0= none, 1=cmf, 2=other (the order is not meaningful)
cmf_012[1:5]
## [1] 1 2 0 0 1
ggplot(pheno.df) +
  aes(as.factor(cmf_012), fill=as.factor(cc.x)) +
  geom_bar(colour="black", alpha=0.3)

table(cmf_012)
## cmf_012
##   0   1   2 
## 244 153  96
sum(is.na(cmf_012))
## [1] 0
# CMF is coded better than cmf
CMF[1:5]
## [1] CMF Oth no  no  Oth
## Levels: CMF no Oth
ggplot(pheno.df) +
  aes(as.factor(CMF), fill=as.factor(cc.x)) +
  geom_bar(colour="black", alpha=0.3)

table(CMF)
## CMF
## CMF  no Oth 
## 131 244 118
sum(is.na(CMF))
## [1] 0
# CMF vs age
ggplot(pheno.df) + 
  aes(x = sub_dx_age.x, fill = as.factor(CMF)) +
  geom_density(colour="black", alpha=0.3)

# CMF is different from cmf
sum(as.vector(cmf) != as.vector(CMF))
## [1] 22
# CMF only
CMF_Only[1:5]
## [1] 1 0 0 0 0
ggplot(pheno.df) +
  aes(as.factor(CMF_Only), fill=as.factor(cc.x)) +
  geom_bar(colour="black", alpha=0.3)

table(CMF_Only)
## CMF_Only
##   0   1 
## 362 131
sum(is.na(CMF_Only))
## [1] 0
# # Treatment by age: patients given cytotoxics are younger than those given hormones
ggplot() + 
  geom_density(data=pheno.df[chemo==1,], aes(sub_dx_age.x), colour="black", fill="red", alpha=0.25) +
  geom_density(data=pheno.df[hormone==1,], aes(sub_dx_age.x), colour="black", fill="blue", alpha=0.25)
## Warning: Removed 1 rows containing non-finite values (stat_density).

t.test(sub_dx_age.x[chemo==1],sub_dx_age.x[hormone==1])
## 
##  Welch Two Sample t-test
## 
## data:  sub_dx_age.x[chemo == 1] and sub_dx_age.x[hormone == 1]
## t = -4.3157, df = 252.3, p-value = 2.286e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -3.403967 -1.270731
## sample estimates:
## mean of x mean of y 
##  40.99598  43.33333

pathology

# Stage
stage_fd[1:5]
## [1] 2 1 1 1 2
summary(as.factor(stage_fd))
##   1   2 
## 344 149
ggplot(pheno.df) + 
  aes(as.factor(stage_fd), fill=as.factor(cc.x)) + 
  geom_bar(colour="black", alpha=0.3)

# No significant differences by stage between cases and controls
# However, tendency to higher stage in controls, 
# which might had caused more cytotoxics and x-ray in controls
fisher.test(table(cc.x, stage_fd))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  table(cc.x, stage_fd)
## p-value = 0.07746
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.4641429 1.0446625
## sample estimates:
## odds ratio 
##  0.6973788
ggplot(pheno.df) + 
  aes(as.factor(cc.x), fill=as.factor(stage_fd)) + 
  geom_bar(colour="black", alpha=0.3)

# Histology
# Does not make sense to pool nst and tubular? 
Histology[1:5]
## [1] other    other    other    medullar other   
## Levels: lobular medullar other r   othe
unique(as.vector(Histology))
## [1] "other"    "medullar" "lobular"
ggplot(pheno.df) +
  aes(Histology, fill=Histology) + 
  geom_bar(colour="black", alpha=0.3)

table(Histology)
## Histology
##  lobular medullar    other r   othe 
##       50       13      430        0
# Histology1 - swop between lobular and other 
Histology1[1:5]
## [1] other    other    other    medullar other   
## Levels: lobular medullar other r   othe
unique(as.vector(Histology1))
## [1] "other"    "medullar" "lobular"
ggplot(pheno.df) +
  aes(Histology1, fill=Histology1) + 
  geom_bar(colour="black", alpha=0.3)

table(Histology1)
## Histology1
##  lobular medullar    other r   othe 
##       36       13      444        0
sum(as.vector(Histology) != as.vector(Histology1))
## [1] 14
pheno.df %>% 
  select(labid.x, Histology, Histology1) %>% 
  filter(as.vector(Histology) != as.vector(Histology1))
##     labid.x Histology Histology1
## 1  id203568   lobular      other
## 2  id216437   lobular      other
## 3  id246347   lobular      other
## 4  id260716   lobular      other
## 5  id270215   lobular      other
## 6  id275762   lobular      other
## 7  id278053   lobular      other
## 8  id298289   lobular      other
## 9  id301831   lobular      other
## 10 id310684   lobular      other
## 11 id320617   lobular      other
## 12 id324813   lobular      other
## 13 id337867   lobular      other
## 14 id371301   lobular      other
#pheno.df[
#  as.vector(Histology) != as.vector(Histology1), 
#  c("labid.x","Histology","Histology1")]

# histol_cat
histo1_cat[1:5]
## [1] unknown   ductal    ductal    medullary ductal   
## 9 Levels: al          posi al          unkn ductal lobular ... unknown
unique(as.vector(histo1_cat))
## [1] "unknown"          "ductal"           "medullary"       
## [4] "lobular"          "Tubular/mucinous" "other carcinoma"
# Cases and controls are balanced by histology
ggplot(pheno.df) +
  aes(histo1_cat, fill=as.factor(cc.x)) + 
  geom_bar(colour="black", alpha=0.3)

table(cc.x, histo1_cat)
##     histo1_cat
## cc.x al          posi al          unkn ductal lobular medullary
##    0                0                0    205      19         6
##    1                0                0    190      31         7
##     histo1_cat
## cc.x other carcinoma r carcinoma posi Tubular/mucinous unknown
##    0               8                0                8       1
##    1              10                0                8       0
fisher.test(table(cc.x, histo1_cat))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  table(cc.x, histo1_cat)
## p-value = 0.4383
## alternative hypothesis: two.sided
# Balance within ductal and lobular: 
# non-significant trend to have more lobular in CBC
pheno.df %>% 
  filter(histo1_cat %in% c("ductal", "lobular")) %>% 
  ggplot() + 
    aes(histo1_cat, fill=as.factor(cc.x)) + 
    geom_bar(colour="black", alpha=0.3)

x <- table(
  as.vector(cc.x[histo1_cat == "ductal" | histo1_cat == "lobular"]), 
  as.vector(histo1_cat[histo1_cat == "ductal" | histo1_cat == "lobular"]))
x
##    
##     ductal lobular
##   0    205      19
##   1    190      31
fisher.test(x)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  x
## p-value = 0.07223
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.9266881 3.4130751
## sample estimates:
## odds ratio 
##   1.758151
rm(x)

# Lobular only
lobular[1:5]
## [1] 0 0 0 0 0
unique(as.vector(lobular))
## [1] 0 1
ggplot(pheno.df) +
  aes(as.factor(lobular), fill=as.factor(cc.x)) +
  geom_bar(colour="black", alpha=0.3)

table(lobular)
## lobular
##   0   1 
## 443  50
# Hist_lob_other
Hist_lob_other[1:5]
## [1] other other other other other
## Levels: lobular other r  duct r  othe
unique(as.vector(Hist_lob_other))
## [1] "other"   "lobular"
ggplot(pheno.df) +
  aes(Hist_lob_other, fill=as.factor(cc.x)) +
  geom_bar(colour="black", alpha=0.3)

table(Hist_lob_other)
## Hist_lob_other
## lobular   other r  duct r  othe 
##      50     443       0       0

receptors

check consistency with treatment? much more more er+ ve? many er unknown

# er1
er1[1:5]
## [1]  1  1  1 NA  1
class(er1)
## [1] "integer"
unique(as.vector(er1))
## [1]  1 NA  0
ggplot(pheno.df) +
  aes(as.factor(er1), fill=as.factor(cc.x)) +
  geom_bar(colour="black", alpha=0.3)

summary(as.factor(er1))
##    0    1 NA's 
##  112  231  150
# Balanced between cases and controls
table(cc.x, er1)
##     er1
## cc.x   0   1
##    0  63 114
##    1  49 117
fisher.test(table(cc.x,er1))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  table(cc.x, er1)
## p-value = 0.2504
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.8178203 2.1335103
## sample estimates:
## odds ratio 
##    1.31847
ggplot(pheno.df) +
  aes(as.factor(cc.x), fill=as.factor(er1)) +
  geom_bar(colour="black", alpha=0.3)

# er1_num (=er1)
er1_num[1:5]
## [1]  1  1  1 NA  1
class(er1_num)
## [1] "numeric"
unique(as.vector(er1_num))
## [1]  1 NA  0
ggplot(pheno.df) +
  aes(as.factor(er1_num), fill=as.factor(cc.x)) +
  geom_bar(colour="black", alpha=0.3)

table(as.factor(er1_num))
## 
##   0   1 
## 112 231
sum(is.na(er1_num))
## [1] 150
ggplot(pheno.df) +
  aes(as.factor(cc.x), fill=as.factor(er1_num)) +
  geom_bar(colour="black", alpha=0.3)

sum(as.vector(er1) != as.vector(er1), na.rm = TRUE)
## [1] 0
# er1_cat
er1_cat[1:5]
## [1] positive positive positive unknown  positive
## Levels: negative own unkn positive tiveposi unknown
class(er1_cat)
## [1] "factor"
unique(as.vector(er1_cat))
## [1] "positive" "unknown"  "negative"
ggplot(pheno.df) +
  aes(as.factor(er1_cat), fill=as.factor(cc.x)) +
  geom_bar(colour="black", alpha=0.3)

table(as.vector(er1_cat))
## 
## negative positive  unknown 
##      113      231      149
sum(is.na(er1_cat))
## [1] 0
# Balanced between cases and controls
x <- table(cc.x, er1_cat)[,c(1,3,5)]
x
##     er1_cat
## cc.x negative positive unknown
##    0       64      114      69
##    1       49      117      80
fisher.test(x)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  x
## p-value = 0.2429
## alternative hypothesis: two.sided
rm(x)

# er_fd
er_fd[1:5]
## [1] 1 1 1 4 1
class(er_fd)
## [1] "numeric"
unique(as.vector(er_fd))
## [1] 1 4 2 0 9 3
ggplot(pheno.df) +
  aes(as.factor(er_fd), fill=as.factor(cc.x)) +
  geom_bar(colour="black", alpha=0.3)

table(as.vector(er_fd))
## 
##   0   1   2   3   4   9 
## 103 231 112  17   4  26
sum(is.na(er_fd))
## [1] 0
# pr1_cat
pr1_cat[1:5]
## [1] positive positive positive negative positive
## 7 Levels: negative own 0 Ot positive tive01Ot tive11no ... unknown
class(pr1_cat)
## [1] "factor"
unique(as.vector(pr1_cat))
## [1] "positive" "negative" "unknown"
ggplot(pheno.df) +
  aes(as.factor(pr1_cat), fill=as.factor(cc.x)) +
  geom_bar(colour="black", alpha=0.3)

table(as.vector(pr1_cat))
## 
## negative positive  unknown 
##       95      188      210
sum(is.na(pr1_cat))
## [1] 0
# Balanced between cases and controls
x <- table(cc.x, pr1_cat)[,c(1,3,7)]
fisher.test(x)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  x
## p-value = 0.4529
## alternative hypothesis: two.sided
rm(x)

ggplot(pheno.df) +
  aes(as.factor(cc.x), fill=as.factor(pr1_cat)) +
  geom_bar(colour="black", alpha=0.3)

# pr correlate with er
table(er1, pr1_cat)[,c(1,3)]
##    pr1_cat
## er1 negative positive
##   0       66       23
##   1       26      163
fisher.test(table(er1, pr1_cat)[,c(1,3)])
## 
##  Fisher's Exact Test for Count Data
## 
## data:  table(er1, pr1_cat)[, c(1, 3)]
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##   9.173234 35.515099
## sample estimates:
## odds ratio 
##   17.71474
table(er1, pr1_cat)[,c(1,3,7)]
##    pr1_cat
## er1 negative positive unknown
##   0       66       23      23
##   1       26      163      42
fisher.test(table(er1, pr1_cat)[,c(1,3,7)])
## 
##  Fisher's Exact Test for Count Data
## 
## data:  table(er1, pr1_cat)[, c(1, 3, 7)]
## p-value < 2.2e-16
## alternative hypothesis: two.sided
# Combined er and pr field
pheno.df <- pheno.df %>% mutate(epr=paste(er1_cat, pr1_cat))

# Look at combined faceted plots
ggplot(pheno.df) +
  aes( as.factor(cc.x), fill=as.factor(epr)) +
  geom_bar(colour="black", alpha=0.3)

ggplot(pheno.df) +
  aes( as.factor(cc.x), fill=as.factor(epr)) +
  geom_bar(colour="black", alpha=0.3) + 
  facet_grid(er1~pr1_cat)

# Excess of controls in ER-ve, PgR-ve
# Excess of controls in ER-ve, PgR+ve??

# Balanced cases and controls in ER+ve, PgR-ve
# Balanced cases and controls in ER+ve, PgR+ve

# ===================================================== #

# pr_fd
pr_fd[1:5]
## [1] 1 1 1 2 1
class(pr_fd)
## [1] "numeric"
unique(as.vector(pr_fd))
## [1] 1 2 0 9 3 4
ggplot(pheno.df) +
  aes(as.factor(pr_fd), fill=as.factor(cc.x)) +
  geom_bar(colour="black", alpha=0.3)

table(as.vector(pr_fd))
## 
##   0   1   2   3   4   9 
## 157 188  94  16   5  33
fisher.test(table(cc.x,pr_fd))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  table(cc.x, pr_fd)
## p-value = 0.6308
## alternative hypothesis: two.sided
sum(is.na(pr_fd))
## [1] 0

reproductive_factors

# age_menarche (categorical)
age_menarche[1:5]
## [1] 1 1 1 1 0
class(age_menarche)
## [1] "integer"
unique(age_menarche)
## [1] 1 0
ggplot(pheno.df) +
  aes(as.factor(age_menarche), fill=as.factor(cc.x)) +
  geom_bar(colour="black", alpha=0.3)

table(age_menarche)
## age_menarche
##   0   1 
## 228 265
sum(is.na(age_menarche))
## [1] 0
# Balanced between cases and controls
table(cc.x, age_menarche)
##     age_menarche
## cc.x   0   1
##    0 109 138
##    1 119 127
fisher.test(table(cc.x,age_menarche))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  table(cc.x, age_menarche)
## p-value = 0.3669
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.5821688 1.2204481
## sample estimates:
## odds ratio 
##   0.843251
sum(is.na(age_menarche))
## [1] 0
ggplot(pheno.df) +
  aes(as.factor(cc.x), fill=as.factor(age_menarche)) +
  geom_bar(colour="black", alpha=0.3)

# Age menarche (numeric)
rh_age_menarche[1:5]
## [1] 13 14 13 13 12
class(rh_age_menarche)
## [1] "numeric"
unique(rh_age_menarche)
##  [1] 13 14 12  9 11 10 15 18 16 17  0 19
hist(rh_age_menarche)

ggplot(pheno.df) + 
  aes(rh_age_menarche, fill=as.factor(cc.x)) +
  geom_histogram(colour="black", alpha=0.3, binwidth=1) +
  stat_bin(binwidth=1, geom="text", aes(label=..count..), vjust=-1)

ggplot(pheno.df) + 
  aes(rh_age_menarche, fill=as.factor(cc.x)) +
  geom_histogram(colour="black", alpha=0.3, binwidth=1)

table(cc.x, rh_age_menarche)
##     rh_age_menarche
## cc.x  0  9 10 11 12 13 14 15 16 17 18 19
##    0  2  7  7 30 63 72 38 16  8  3  1  0
##    1  2  5 14 28 70 69 29 19  5  3  1  1
sum(is.na(rh_age_menarche))
## [1] 0
# Balanced between cases and controls
t.test(rh_age_menarche~cc.x)
## 
##  Welch Two Sample t-test
## 
## data:  rh_age_menarche by cc.x
## t = 0.51979, df = 490.82, p-value = 0.6034
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.2525971  0.4343225
## sample estimates:
## mean in group 0 mean in group 1 
##        12.64777        12.55691
# odd density plot behaving ...
ggplot(pheno.df) + 
  aes(rh_age_menarche, fill=as.factor(cc.x)) +
  geom_density(colour="black", alpha=0.3)

# adjusted plot
ggplot(pheno.df) + 
  aes(rh_age_menarche, fill=as.factor(cc.x)) +
  geom_density(colour="black", alpha=0.3, adjust=3)

# age_menopause: 0,1,2 (pre, in, post?)

age_menopause[1:5]
## [1] 0 2 0 0 0
class(age_menopause)
## [1] "integer"
unique(age_menopause)
## [1] 0 2 1
ggplot(pheno.df) +
  aes(as.factor(age_menopause), fill=as.factor(cc.x)) +
  geom_bar(colour="black", alpha=0.3)

table(age_menopause)
## age_menopause
##   0   1   2 
## 400  72  21
sum(is.na(age_menopause))
## [1] 0
# Balanced cases and controls
table(cc.x, age_menopause)
##     age_menopause
## cc.x   0   1   2
##    0 201  37   9
##    1 199  35  12
fisher.test(table(cc.x, age_menopause))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  table(cc.x, age_menopause)
## p-value = 0.793
## alternative hypothesis: two.sided
ggplot(pheno.df) +
  aes(as.factor(cc.x), fill=as.factor(age_menopause)) +
  geom_bar(colour="black", alpha=0.3)

# age_menopause_1yrbf_fd: what is this? 
# Does not look as menopausal age...

age_menopause_1yrbf_fd[1:5]
## [1] -1 48 -1 -1 -1
class(age_menopause_1yrbf_fd)
## [1] "numeric"
unique(age_menopause_1yrbf_fd)
##  [1] -1 48 49 29 33 46 31 41 43 25 45 39 40 36 35 28 27 38 NA 42 47 34 32
## [24] 44 52 26 37
ggplot(pheno.df) +
  aes(age_menopause_1yrbf_fd, fill=as.factor(cc.x)) +
  geom_histogram(colour="black", alpha=0.3, binwidth=1)
## Warning: Removed 3 rows containing non-finite values (stat_bin).

table(age_menopause_1yrbf_fd)
## age_menopause_1yrbf_fd
##  -1  25  26  27  28  29  31  32  33  34  35  36  37  38  39  40  41  42 
## 397   1   1   2   2   5   3   2   4   3   5   5   3   3   1   6   7   8 
##  43  44  45  46  47  48  49  52 
##   7   4   6   3   5   4   2   1
sum(is.na(age_menopause_1yrbf_fd))
## [1] 3
# Balanced between cases and controls
t.test(age_menopause_1yrbf_fd~cc.x)
## 
##  Welch Two Sample t-test
## 
## data:  age_menopause_1yrbf_fd by cc.x
## t = -0.37457, df = 486.79, p-value = 0.7081
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -3.384364  2.300597
## sample estimates:
## mean in group 0 mean in group 1 
##        6.348361        6.890244
pheno.df %>% 
  filter(age_menopause_1yrbf_fd != -1) %>% 
  t.test(data=., age_menopause_1yrbf_fd~cc.x)
## 
##  Welch Two Sample t-test
## 
## data:  age_menopause_1yrbf_fd by cc.x
## t = -1.7734, df = 89.443, p-value = 0.07957
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -4.918452  0.279229
## sample estimates:
## mean in group 0 mean in group 1 
##        37.97826        40.29787
pheno.df %>% 
  filter(age_menopause_1yrbf_fd != -1) %>% 
  ggplot() +
    aes(age_menopause_1yrbf_fd, fill=as.factor(cc.x)) +
    geom_histogram(colour="black", alpha=0.3, binwidth=1)

pheno.df %>% 
  filter(age_menopause_1yrbf_fd != -1) %>% 
  ggplot() +
    aes(age_menopause_1yrbf_fd, fill=as.factor(cc.x)) +
    geom_density(colour="black", alpha=0.3)

# Of those who is in mp there is trent to have younger mp age in controls
# This may be explained by more cytotoxic treatment in this group

# age_1fftp_fd: what is this? another age of a childbearing?
# ftp - in my prev life meant "full term pregnancy"?
# Less pregnancies in cases than in controls?

age_1fftp_fd[1:5]
## [1] 20 22  0 29 28
class(age_1fftp_fd)
## [1] "numeric"
unique(age_1fftp_fd)
##  [1] 20 22  0 29 28 27 24 30 21 23 25 26 38 19 40 17 31 18 39 37 32 33 16
## [24] 36 35 34 15
ggplot(pheno.df) + 
  aes(age_1fftp_fd, fill=as.factor(cc.x)) +
  geom_histogram(colour="black", alpha=0.3, binwidth=1)

ggplot(pheno.df) + 
  aes(age_1fftp_fd, fill=as.factor(cc.x)) +
  geom_density(colour="black", alpha=0.3)

pheno.df %>% filter(age_1fftp_fd!=0) %>% 
  ggplot() + 
  aes(age_1fftp_fd, fill=as.factor(cc.x)) +
  geom_density(colour="black", alpha=0.3)

pheno.df %>% filter(age_1fftp_fd!=0) %>% 
  t.test(data=.,age_1fftp_fd~cc.x)
## 
##  Welch Two Sample t-test
## 
## data:  age_1fftp_fd by cc.x
## t = -0.88651, df = 393.25, p-value = 0.3759
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.4164195  0.5360263
## sample estimates:
## mean in group 0 mean in group 1 
##        24.85514        25.29534
table(age_1fftp_fd)
## age_1fftp_fd
##  0 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 
## 86  1  3  7 10 26 33 29 32 38 28 35 22 20 29 16 19 16 12  6  5  4  4  4  2 
## 39 40 
##  3  3
sum(is.na(age_1fftp_fd))
## [1] 0
# Balanced between cases and controls
t.test(age_1fftp_fd~cc.x)
## 
##  Welch Two Sample t-test
## 
## data:  age_1fftp_fd by cc.x
## t = 1.7818, df = 476.59, p-value = 0.07541
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.1735768  3.5513458
## sample estimates:
## mean in group 0 mean in group 1 
##        21.53441        19.84553
pheno.df %>% 
  filter(age_1fftp_fd != 0) %>% 
  t.test(data=., age_1fftp_fd~cc.x)
## 
##  Welch Two Sample t-test
## 
## data:  age_1fftp_fd by cc.x
## t = -0.88651, df = 393.25, p-value = 0.3759
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.4164195  0.5360263
## sample estimates:
## mean in group 0 mean in group 1 
##        24.85514        25.29534
# Num_ftp_fd - what would it be?
# Num of full time pregnancies???
# The numnbers do not make sence and do not match the num_preg

Num_ftp_fd[1:5]
## [1]  1  2 -1  2  2
class(Num_ftp_fd)
## [1] "numeric"
unique(Num_ftp_fd)
## [1]  1  2 -1  3  4  0  5  6  7
ggplot(pheno.df) +
  aes(as.factor(Num_ftp_fd), fill=as.factor(cc.x)) +
  geom_bar(colour="black", alpha=0.3)

ggplot(pheno.df) +
  aes(as.factor(cc.x), fill=as.factor(Num_ftp_fd)) +
  geom_bar(colour="black", alpha=0.3)

table(cc.x, Num_ftp_fd)
##     Num_ftp_fd
## cc.x  -1   0   1   2   3   4   5   6   7
##    0  23  10  38 106  45  19   4   1   1
##    1  31  22  45 104  32  10   2   0   0
sum(is.na(Num_ftp_fd))
## [1] 0
# Balanced in cases and controls
#fisher.test(table(cc.x, Num_ftp_fd)) # can not calculate ...
fisher.test(table(cc.x, Num_ftp_fd)[,1:5]) # non-sign trend
## 
##  Fisher's Exact Test for Count Data
## 
## data:  table(cc.x, Num_ftp_fd)[, 1:5]
## p-value = 0.08616
## alternative hypothesis: two.sided
# num_preg

num_preg[1:5]
## [1] 1 1 0 1 1
class(num_preg)
## [1] "numeric"
unique(num_preg)
## [1] 1 0 2
ggplot(pheno.df) +
  aes(as.factor(num_preg), fill=as.factor(cc.x)) +
  geom_bar(colour="black", alpha=0.3)

sum(is.na(num_preg))
## [1] 0
# Less pregnancies in cases
table(cc.x, num_preg)
##     num_preg
## cc.x   0   1   2
##    0  33 189  25
##    1  53 181  12
sum(is.na(num_preg))
## [1] 0
fisher.test(table(cc.x, num_preg))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  table(cc.x, num_preg)
## p-value = 0.009058
## alternative hypothesis: two.sided
ggplot(pheno.df) +
  aes(as.factor(cc.x), fill=as.factor(num_preg)) +
  geom_bar(colour="black", alpha=0.3)

# No CBC with 3-pregnancies ...

BMI

this is about lafelong exposure to Eg

in pre-menopause it is protective this is about endocrine disorders and disruption of ms-cyclingand about cumulative lafelong exposure to Eg too speculative …

# BMI_age18

BMI_age18[1:5]
## [1] 20.20202 19.73983 23.29737 19.46995 25.84315
class(BMI_age18)
## [1] "numeric"
ggplot(pheno.df) + 
  aes(BMI_age18, fill=as.factor(cc.x)) +
  geom_histogram(colour="black", alpha=0.3, binwidth=1)
## Warning: Removed 2 rows containing non-finite values (stat_bin).

ggplot(pheno.df) + 
  aes(BMI_age18, fill=as.factor(cc.x)) +
  geom_density(colour="black", alpha=0.3)
## Warning: Removed 2 rows containing non-finite values (stat_density).

# controls tended to have slightly higher BMI at age of 18
# Caused by obese cases, consistent with protective role of
# obesity in pre-menopause
t.test(BMI_age18~cc.x)
## 
##  Welch Two Sample t-test
## 
## data:  BMI_age18 by cc.x
## t = 2.1121, df = 470.08, p-value = 0.03521
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.03452471 0.95732461
## sample estimates:
## mean in group 0 mean in group 1 
##        20.64919        20.15327
summary(BMI_age18)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   13.56   18.79   20.08   20.40   21.63   34.96       2
sum(is.na(BMI_age18))
## [1] 2
# BMI_dx

BMI_dx[1:5]
## [1] 22.77319 20.94139 23.29737 32.61632 25.84315
class(BMI_dx)
## [1] "numeric"
ggplot(pheno.df) + 
  aes(BMI_dx, fill=as.factor(cc.x)) +
  geom_histogram(colour="black", alpha=0.3, binwidth=1)
## Warning: Removed 3 rows containing non-finite values (stat_bin).

ggplot(pheno.df) + 
  aes(BMI_dx, fill=as.factor(cc.x)) +
  geom_density(colour="black", alpha=0.3)
## Warning: Removed 3 rows containing non-finite values (stat_density).

# controls had higher BMI at Ds
t.test(BMI_dx~cc.x)
## 
##  Welch Two Sample t-test
## 
## data:  BMI_dx by cc.x
## t = 2.5725, df = 458.17, p-value = 0.01041
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.2469653 1.8452137
## sample estimates:
## mean in group 0 mean in group 1 
##        23.93155        22.88546
summary(BMI_dx)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   14.10   20.37   22.31   23.41   25.09   58.77       3
sum(is.na(BMI_dx))
## [1] 3
# BMI_age18 vs BMI_dx

ggplot(pheno.df) + 
  aes(x=BMI_age18, y=BMI_dx, colour=as.factor(cc.x)) +
  geom_point(alpha=0.3) + 
  geom_smooth(method="lm")
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## Warning: Removed 5 rows containing missing values (geom_point).

# BMI_ref

BMI_ref[1:5]
## [1] 25.71166 21.97129 25.79352 32.61632 28.05828
class(BMI_ref)
## [1] "numeric"
ggplot(pheno.df) + 
  aes(BMI_ref, fill=as.factor(cc.x)) +
  geom_histogram(colour="black", alpha=0.3, binwidth=1)
## Warning: Removed 3 rows containing non-finite values (stat_bin).

ggplot(pheno.df) + 
  aes(BMI_ref, fill=as.factor(cc.x)) +
  geom_density(colour="black", alpha=0.3)
## Warning: Removed 3 rows containing non-finite values (stat_density).

summary(BMI_ref)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   15.66   21.30   23.37   24.64   26.76   58.77       3
sum(is.na(BMI_ref))
## [1] 3
# BMI_ref vs BMI_dx
ggplot(pheno.df) + 
  aes(x=BMI_ref, y=BMI_dx, colour=as.factor(cc.x)) +
  geom_point(alpha=0.3) + 
  geom_smooth(method="lm")
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## Warning: Removed 4 rows containing missing values (geom_point).

general_data_fields

# DOB

DOB[1:5]
## [1] -5049 -7459 -3916 -5890 -6407
class(DOB)
## [1] "numeric"
hist(DOB)

ggplot(pheno.df) + 
  aes(DOB, fill=as.factor(cc.x)) +
  geom_histogram(colour="black", alpha=0.3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(pheno.df) + 
  aes(DOB, fill=as.factor(cc.x)) +
  geom_density(colour="black", alpha=0.3)

# dxyr_stratum

dxyr_stratum[1:5]
## [1] 2 2 3 1 1
class(dxyr_stratum)
## [1] "integer"
summary(dxyr_stratum)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   2.000   1.842   2.000   4.000
ggplot(pheno.df) + 
  aes(dxyr_stratum, fill=as.factor(cc.x)) +
  geom_bar(colour="black", alpha=0.3)

# Absolutely balanced between cases and controls
table(dxyr_stratum)
## dxyr_stratum
##   1   2   3   4 
## 214 161 100  18
sum(is.na(dxyr_stratum))
## [1] 0
fisher.test(table(cc.x,dxyr_stratum))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  table(cc.x, dxyr_stratum)
## p-value = 1
## alternative hypothesis: two.sided
ggplot(pheno.df) + 
  aes(as.factor(cc.x), fill=as.factor(dxyr_stratum)) +
  geom_bar(colour="black", alpha=0.3)

# age_stratum1

age_stratum1[1:5]
## [1] 45to49 45to49 45to49 40to44 40to44
## Levels: 20to34 35to39 40to44 45to49 50to54
class(age_stratum1)
## [1] "factor"
summary(age_stratum1)
## 20to34 35to39 40to44 45to49 50to54 
##     40     92    218    133     10
ggplot(pheno.df) + 
  aes(age_stratum1, fill=as.factor(cc.x)) +
  geom_bar(colour="black", alpha=0.3)

table(age_stratum1)
## age_stratum1
## 20to34 35to39 40to44 45to49 50to54 
##     40     92    218    133     10
sum(is.na(age_stratum1))
## [1] 0
# Absolutely balanced between cases and controls
table(age_stratum1)
## age_stratum1
## 20to34 35to39 40to44 45to49 50to54 
##     40     92    218    133     10
sum(is.na(age_stratum1))
## [1] 0
fisher.test(table(cc.x,age_stratum1))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  table(cc.x, age_stratum1)
## p-value = 1
## alternative hypothesis: two.sided
ggplot(pheno.df) + 
  aes(as.factor(cc.x), fill=as.factor(age_stratum1)) +
  geom_bar(colour="black", alpha=0.3)

# sub_dx_age_cat

sub_dx_age_cat[1:5]
## [1] 0 0 0 1 1
class(sub_dx_age_cat)
## [1] "integer"
summary(sub_dx_age_cat)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  1.0000  0.6613  1.0000  1.0000
ggplot(pheno.df) + 
  aes(sub_dx_age_cat, fill=as.factor(cc.x)) +
  geom_bar(colour="black", alpha=0.3)

table(sub_dx_age_cat)
## sub_dx_age_cat
##   0   1 
## 167 326
sum(is.na(sub_dx_age_cat))
## [1] 0
# Balanced between cases and controls
fisher.test(table(cc.x,sub_dx_age_cat))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  table(cc.x, sub_dx_age_cat)
## p-value = 0.7758
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.6369474 1.3909684
## sample estimates:
## odds ratio 
##   0.941456
ggplot(pheno.df) + 
  aes(as.factor(cc.x), fill=as.factor(sub_dx_age_cat)) +
  geom_bar(colour="black", alpha=0.3)

# refage

refage[1:5]
## [1] 53 59 51 50 52
class(refage)
## [1] "numeric"
summary(refage)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   27.00   44.00   48.00   47.81   52.00   70.00
ggplot(pheno.df) + 
  aes(refage, fill=as.factor(cc.x)) +
  geom_histogram(colour="black", alpha=0.3, binwidth=1)

ggplot(pheno.df) + 
  aes(refage, fill=as.factor(cc.x)) +
  geom_density(colour="black", alpha=0.3)

summary(refage)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   27.00   44.00   48.00   47.81   52.00   70.00
sum(is.na(refage))
## [1] 0
# balanced in cases and controls
t.test(refage~cc.x)
## 
##  Welch Two Sample t-test
## 
## data:  refage by cc.x
## t = -0.038983, df = 490.73, p-value = 0.9689
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.211407  1.164273
## sample estimates:
## mean in group 0 mean in group 1 
##        47.79757        47.82114
# rstime

rstime[1:5]
## [1] 7.42 9.75 6.09 6.25 9.34
class(rstime)
## [1] "numeric"
summary(rstime)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   3.330   5.330   5.988   8.250  15.580
ggplot(pheno.df) + 
  aes(rstime, fill=as.factor(cc.x)) +
  geom_histogram(colour="black", alpha=0.3, binwidth=1)

ggplot(pheno.df) + 
  aes(rstime, fill=as.factor(cc.x)) +
  geom_density(colour="black", alpha=0.3)

sum(is.na(rstime))
## [1] 0
# balanced in cases and controls
t.test(refage~cc.x)
## 
##  Welch Two Sample t-test
## 
## data:  refage by cc.x
## t = -0.038983, df = 490.73, p-value = 0.9689
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.211407  1.164273
## sample estimates:
## mean in group 0 mean in group 1 
##        47.79757        47.82114

explore_rstime_and_offset

ggplot(pheno.df) + 
  aes(offset.x, fill=as.factor(cc.x)) +
  geom_histogram(colour="black", alpha=0.3, binwidth=1)

ggplot(pheno.df) + 
  geom_density(aes(offset.x), fill="red", colour="black", alpha=0.3) +
  geom_density(aes(rstime), fill="blue", colour="black", alpha=0.3)

ggplot(pheno.df) + 
  geom_density(aes(rstime), fill="red", colour="black", alpha=0.3) +
  geom_density(aes(refage - sub_dx_age.x), fill="blue", colour="black", alpha=0.3)

ggplot(pheno.df) + 
  aes(refage - sub_dx_age.x, fill=as.factor(cc.x)) +
  geom_histogram(colour="black", alpha=0.3, binwidth=1)

ggplot(pheno.df) + 
  aes(refage - sub_dx_age.x, fill=as.factor(cc.x)) +
  geom_density(colour="black", alpha=0.3)

ggplot(pheno.df) + 
  geom_density(aes(refage), fill="red", colour="black", alpha=0.3) +
  geom_density(aes(sub_dx_age.x), fill="blue", colour="black", alpha=0.3)

unknown_fields

# dsmiss

dsmiss[1:5]
## [1] 0 0 0 0 1
class(dsmiss)
## [1] "integer"
summary(dsmiss)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000000 0.000000 0.000000 0.006085 0.000000 1.000000
table(dsmiss)
## dsmiss
##   0   1 
## 490   3
sum(is.na(dsmiss))
## [1] 0
ggplot(pheno.df) + 
  aes(dsmiss, fill=as.factor(cc.x)) +
  geom_bar(colour="black", alpha=0.3)

# Balanced between cases and controls
fisher.test(table(cc.x,dsmiss))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  table(cc.x, dsmiss)
## p-value = 1
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.008446842 9.676203318
## sample estimates:
## odds ratio 
##  0.5006798
ggplot(pheno.df) + 
  aes(as.factor(cc.x), fill=as.factor(dsmiss)) +
  geom_bar(colour="black", alpha=0.3)

# conf_miss

conf_miss[1:5]
## [1] 0 0 0 0 0
class(conf_miss)
## [1] "numeric"
summary(conf_miss)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000000 0.000000 0.000000 0.006085 0.000000 1.000000
table(conf_miss)
## conf_miss
##   0   1 
## 490   3
sum(is.na(conf_miss))
## [1] 0
ggplot(pheno.df) + 
  aes(conf_miss, fill=as.factor(cc.x)) +
  geom_bar(colour="black", alpha=0.3)

# Balanced between cases and controls
fisher.test(table(cc.x,conf_miss))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  table(cc.x, conf_miss)
## p-value = 0.2485
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.000000 2.424045
## sample estimates:
## odds ratio 
##          0
ggplot(pheno.df) + 
  aes(as.factor(cc.x), fill=as.factor(conf_miss)) +
  geom_bar(colour="black", alpha=0.3)

# dsmiss vs conf_miss

sum(dsmiss != conf_miss, na.rm=TRUE)
## [1] 6
# Phase - all 1

Phase[1:5]
## [1] 1 1 1 1 1
class(Phase)
## [1] "numeric"
summary(Phase)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1       1       1       1       1       1
table(Phase)
## Phase
##   1 
## 493
sum(is.na(Phase))
## [1] 0
sum(Phase!=1)
## [1] 0
# Deleterious

Deleterious[1:5]
## [1] 0 0 0 0 0
class(Deleterious)
## [1] "integer"
summary(Deleterious)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.03651 0.00000 9.00000
table(Deleterious)
## Deleterious
##   0   9 
## 491   2
sum(is.na(Deleterious))
## [1] 0
ggplot(pheno.df) + 
  aes(as.factor(Deleterious), fill=as.factor(cc.x)) +
  geom_bar(colour="black", alpha=0.3)

# Balanced between cases and controls
fisher.test(table(cc.x,Deleterious))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  table(cc.x, Deleterious)
## p-value = 0.2485
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.1886951       Inf
## sample estimates:
## odds ratio 
##        Inf
ggplot(pheno.df) + 
  aes(as.factor(cc.x), fill=as.factor(Deleterious)) +
  geom_bar(colour="black", alpha=0.3)

# Detach pheno data frame
detach(pheno.df)

reshape_pheno_table

# Select columns
pheno.df <- pheno.df %>% select(
  labid = labid.x,
  cc = cc.x,
  family_history = family_history.y,
  sub_dx_age = sub_dx_age.x,
  refage = refage,
  rstime = rstime,
  offset = offset.x,
  Eigen_1 = Eigen_1.x,
  Eigen_2 = Eigen_2.x,
  Eigen_3 = Eigen_3.x,
  Eigen_4 = Eigen_4.x,
  Eigen_5 = Eigen_5.x,
  stage_fd = stage_fd,
  er1 = er1,
  pr1_cat = pr1_cat,
  histo1_cat = histo1_cat,
  hormone = hormone,
  CMF = CMF,
  XRTBrCHAR = XRTBrCHAR,
  dose_caseloc = dose_caseloc,
  rh_age_menarche = rh_age_menarche,
  age_1fftp_fd = age_1fftp_fd, 
  age_menopause_1yrbf_fd = age_menopause_1yrbf_fd,
  num_preg = num_preg,
  BMI_age18 = BMI_age18,
  BMI_dx = BMI_dx,
  BMI_ref = BMI_ref)

# Recode categorical pr to NA, 0 and 1
pheno.df <- pheno.df %>% mutate(
  pr1 = ifelse(pr1_cat=="negative", 0,
        ifelse(pr1_cat=="positive", 1,
        NA)))

# Substitute zero age of menarche to NA
pheno.df <- pheno.df %>% mutate(
  age_menarche = ifelse(rh_age_menarche==0, NA, rh_age_menarche))

# Remove corrected and recoded fields
pheno.df <- pheno.df %>% select(-rh_age_menarche, -pr1_cat)

# Remove demographics table
rm(demographics.df, odd_cases, duplicated_cases)

merge_pheno_and_covar

# Check data frames
dim(covar.df)
## [1] 498  13
dim(pheno.df)
## [1] 493  27
colnames(covar.df)
##  [1] "labid"      "cc"         "sub_dx_age" "offset"     "chemo"     
##  [6] "hormone"    "xrt"        "dose"       "Eigen_1"    "Eigen_2"   
## [11] "Eigen_3"    "Eigen_4"    "Eigen_5"
colnames(pheno.df)
##  [1] "labid"                  "cc"                    
##  [3] "family_history"         "sub_dx_age"            
##  [5] "refage"                 "rstime"                
##  [7] "offset"                 "Eigen_1"               
##  [9] "Eigen_2"                "Eigen_3"               
## [11] "Eigen_4"                "Eigen_5"               
## [13] "stage_fd"               "er1"                   
## [15] "histo1_cat"             "hormone"               
## [17] "CMF"                    "XRTBrCHAR"             
## [19] "dose_caseloc"           "age_1fftp_fd"          
## [21] "age_menopause_1yrbf_fd" "num_preg"              
## [23] "BMI_age18"              "BMI_dx"                
## [25] "BMI_ref"                "pr1"                   
## [27] "age_menarche"
# Prepare key for joining
pheno.df$labid <- as.vector(pheno.df$labid)
covar.df$labid <- as.vector(covar.df$labid)

ph.df <- left_join(covar.df, pheno.df, by="labid")
dim(ph.df)
## [1] 498  39
colnames(ph.df)
##  [1] "labid"                  "cc.x"                  
##  [3] "sub_dx_age.x"           "offset.x"              
##  [5] "chemo"                  "hormone.x"             
##  [7] "xrt"                    "dose"                  
##  [9] "Eigen_1.x"              "Eigen_2.x"             
## [11] "Eigen_3.x"              "Eigen_4.x"             
## [13] "Eigen_5.x"              "cc.y"                  
## [15] "family_history"         "sub_dx_age.y"          
## [17] "refage"                 "rstime"                
## [19] "offset.y"               "Eigen_1.y"             
## [21] "Eigen_2.y"              "Eigen_3.y"             
## [23] "Eigen_4.y"              "Eigen_5.y"             
## [25] "stage_fd"               "er1"                   
## [27] "histo1_cat"             "hormone.y"             
## [29] "CMF"                    "XRTBrCHAR"             
## [31] "dose_caseloc"           "age_1fftp_fd"          
## [33] "age_menopause_1yrbf_fd" "num_preg"              
## [35] "BMI_age18"              "BMI_dx"                
## [37] "BMI_ref"                "pr1"                   
## [39] "age_menarche"
attach(ph.df)

sum(cc.x != cc.y, na.rm = TRUE)
## [1] 0
sum(is.na(cc.x)) # to be preserved
## [1] 0
sum(is.na(cc.y)) # to be removed
## [1] 5
sum(sub_dx_age.x != sub_dx_age.y, na.rm = TRUE)
## [1] 0
sum(is.na(sub_dx_age.x)) # to be preserved
## [1] 0
sum(is.na(sub_dx_age.y)) # to be removed
## [1] 5
sum(offset.x != offset.y, na.rm = TRUE)
## [1] 0
sum(is.na(offset.x)) # to be preserved
## [1] 0
sum(is.na(offset.y)) # to be removed
## [1] 5
sum(hormone.x != hormone.y, na.rm = TRUE)
## [1] 0
sum(is.na(hormone.x)) # to be preserved
## [1] 1
sum(is.na(hormone.y)) # to be removed
## [1] 6
sum(is.na(chemo)) # to be removed, substituted by CMF
## [1] 0
sum(Eigen_1.x != Eigen_1.y, na.rm = TRUE)
## [1] 0
sum(is.na(Eigen_1.x)) # to be preserved
## [1] 0
sum(is.na(Eigen_1.y)) # to be removed
## [1] 5
sum(Eigen_2.x != Eigen_2.y, na.rm = TRUE)
## [1] 0
sum(is.na(Eigen_2.x)) # to be preserved
## [1] 0
sum(is.na(Eigen_2.y)) # to be removed
## [1] 5
sum(Eigen_3.x != Eigen_3.y, na.rm = TRUE)
## [1] 0
sum(is.na(Eigen_3.x)) # to be preserved
## [1] 0
sum(is.na(Eigen_3.y)) # to be removed
## [1] 5
sum(Eigen_4.x != Eigen_4.y, na.rm = TRUE)
## [1] 0
sum(is.na(Eigen_4.x)) # to be preserved
## [1] 0
sum(is.na(Eigen_4.y)) # to be removed
## [1] 5
sum(Eigen_5.x != Eigen_5.y, na.rm = TRUE)
## [1] 0
sum(is.na(Eigen_5.x)) # to be preserved
## [1] 0
sum(is.na(Eigen_5.y)) # to be removed
## [1] 5
sum(is.na(family_history)) # to be preserved
## [1] 5
sum(is.na(refage)) # to be preserved
## [1] 5
sum(is.na(rstime)) # to be preserved
## [1] 5
sum(is.na(stage_fd)) # to be preserved
## [1] 5
sum(is.na(er1)) # to be preserved
## [1] 155
sum(is.na(histo1_cat)) # to be preserved
## [1] 5
sum(is.na(CMF)) # to be preserved
## [1] 5
sum(is.na(XRTBrCHAR)) # to be preserved
## [1] 5
sum(is.na(dose_caseloc)) # to be preserved
## [1] 8
sum(is.na(age_1fftp_fd)) # to be preserved
## [1] 5
sum(is.na(age_menopause_1yrbf_fd)) # to be preserved
## [1] 8
sum(is.na(num_preg)) # to be preserved
## [1] 5
sum(is.na(BMI_age18)) # to be preserved
## [1] 7
sum(is.na(BMI_dx)) # to be preserved
## [1] 8
sum(is.na(BMI_ref)) # to be preserved
## [1] 8
sum(is.na(pr1)) # to be preserved
## [1] 215
sum(is.na(age_menarche)) # to be preserved
## [1] 9
detach(ph.df)

ph.df <- ph.df %>% 
  select(
    cc = cc.x, 
    age_dx = sub_dx_age.x, 
    age_ref = refage, 
    rstime = rstime,
    offset = offset.x, 
    Eigen_1 = Eigen_1.x, 
    Eigen_2 = Eigen_2.x, 
    Eigen_3 = Eigen_3.x, 
    Eigen_4 = Eigen_4.x, 
    Eigen_5 = Eigen_5.x, 
    stage = stage_fd, 
    er1 = er1, 
    pr1 = pr1, 
    hist_cat = histo1_cat, 
    hormone = hormone.x, 
    chemo_cat = CMF, 
    br_xray = XRTBrCHAR, 
    br_xray_dose = dose_caseloc, 
    age_menarche = age_menarche, 
    age_1st_ftp = age_1fftp_fd, 
    age_menopause = age_menopause_1yrbf_fd, 
    num_preg = num_preg, 
    BMI_age18 = BMI_age18, 
    BMI_dx = BMI_dx, 
    BMI_ref = BMI_ref)

rm(pheno.df, covar.df)

save_data

save.image(file="data/s05_reshape_data_feb2016_tmp.RData")

final_section

sessionInfo()
## R version 3.2.3 (2015-12-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Scientific Linux release 6.7 (Carbon)
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggplot2_2.1.0 stringr_1.0.0 dplyr_0.4.3   tidyr_0.4.1   knitr_1.12.3 
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.4      magrittr_1.5     munsell_0.4.3    colorspace_1.2-6
##  [5] R6_2.1.2         plyr_1.8.3       tools_3.2.3      parallel_3.2.3  
##  [9] grid_3.2.3       gtable_0.2.0     DBI_0.3.1        htmltools_0.3.5 
## [13] yaml_2.1.13      lazyeval_0.1.10  assertthat_0.1   digest_0.6.9    
## [17] reshape2_1.4.1   formatR_1.3      evaluate_0.8.3   rmarkdown_0.9.5 
## [21] labeling_0.3     stringi_1.0-1    scales_0.4.0
Sys.time()
## [1] "2016-06-02 21:35:32 BST"